From 54995ad6ab8fc94893a9a0c3ea9506ee28c3d278 Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 22 Jan 2025 13:11:43 +0100 Subject: [PATCH] Further tweaks based on __manifest__.py files Loading configs now works --- poetry.lock | 43 ++++- pyproject.toml | 1 + src/auto_archiver/core/config.py | 34 ++-- src/auto_archiver/core/loader.py | 58 ++++++- src/auto_archiver/core/orchestrator.py | 158 ++++++++++++------ .../modules/generic_extractor/__manifest__.py | 4 +- .../twitter_api_archiver/__manifest__.py | 2 +- 7 files changed, 214 insertions(+), 86 deletions(-) diff --git a/poetry.lock b/poetry.lock index 40d108a..bbfb975 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1043,7 +1043,7 @@ version = "3.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" optional = false python-versions = ">=3.8" -groups = ["docs"] +groups = ["main", "docs"] files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, @@ -1179,7 +1179,7 @@ version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" -groups = ["docs"] +groups = ["main", "docs"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, @@ -1654,7 +1654,7 @@ version = "2.19.1" description = "Pygments is a syntax highlighting package written in Python." optional = false python-versions = ">=3.8" -groups = ["docs"] +groups = ["main", "docs"] files = [ {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, @@ -2031,6 +2031,41 @@ files = [ [package.dependencies] six = ">=1.7.0" +[[package]] +name = "rich" +version = "13.9.4" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.8.0" +groups = ["main"] +files = [ + {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, + {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + +[[package]] +name = "rich-argparse" +version = "1.6.0" +description = "Rich help formatters for argparse and optparse" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "rich_argparse-1.6.0-py3-none-any.whl", hash = "sha256:fbe70a1d821b3f2fa8958cddf0cae131870a6e9faa04ab52b409cb1eda809bd7"}, + {file = "rich_argparse-1.6.0.tar.gz", hash = "sha256:092083c30da186f25bcdff8b1d47fdfb571288510fb051e0488a72cc3128de13"}, +] + +[package.dependencies] +rich = ">=11.0.0" + [[package]] name = "rsa" version = "4.9" @@ -2966,4 +3001,4 @@ test = ["pytest (>=8.1,<9.0)", "pytest-rerunfailures (>=14.0,<15.0)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "462c7c5f9d1fbae895d6299ba0b690b6e24d0655a4c9fc79f75ddef4eec222f8" +content-hash = "911543169cbd6c68ab3392a052ea58917539acdfbc6511e591f8a2b497443cdc" diff --git a/pyproject.toml b/pyproject.toml index ccfcae6..4f20c8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ dependencies = [ "retrying (>=0.0.0)", "tsp-client (>=0.0.0)", "certvalidator (>=0.0.0)", + "rich-argparse (>=1.6.0,<2.0.0)", ] [tool.poetry.group.dev.dependencies] diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 66c2eb5..db5b6d2 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -4,14 +4,11 @@ It supports CLI argument parsing, loading from YAML file, and overrides to allow flexible setup in various environments. """ - import argparse -import yaml +from configparser import ConfigParser from dataclasses import dataclass, field -# @dataclass -# class Config: # configurable_parents = [ # Feeder, # Enricher, @@ -50,21 +47,6 @@ from dataclasses import dataclass, field # parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml') # parser.add_argument('--version', action='version', version=__version__) -def format_config(config: dict) -> dict: - # Iterate over all step subclasses to gather default configs and CLI arguments - new_config = {} - for step, values in config['steps'].items(): - new_config[f"--{step}"] = values - - # format configurations - for name, confg_vals in config['configurations'].items(): - for key, value in confg_vals.items(): - assert "." not in key, "config key cannot contain '.'" - config_path = f"--{name}.{key}" - new_config[config_path] = value - - return new_config - class LoadFromFile (argparse.Action): def __call__ (self, parser, namespace, values, option_string = None): @@ -72,6 +54,14 @@ class LoadFromFile (argparse.Action): # parse arguments in the file and store them in the target namespace parser.parse_args(f.read().split(), namespace) -def read_yaml(yaml_filename: str) -> dict: - with open(yaml_filename, "r", encoding="utf-8") as inf: - return format_config(yaml.safe_load(inf)) +def read_config(config_filename: str) -> dict: + config = ConfigParser() + config.read(config_filename) + # setup basic format + if 'STEPS' not in config.sections(): + config.add_section("STEPS") + return config + +def store_config(config: ConfigParser, config_filename: str): + with open(config_filename, "w", encoding="utf-8") as outf: + config.write(outf) \ No newline at end of file diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index 8b96198..d39f31e 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -1,11 +1,23 @@ import ast +from dataclasses import dataclass, field import os import copy from os.path import join, dirname from typing import List + +MODULE_TYPES = [ + 'feeder', + 'enricher', + 'archiver', + 'database', + 'storage', + 'formatter' +] + MANIFEST_FILE = "__manifest__.py" _DEFAULT_MANIFEST = { + 'name': '', 'author': 'Bellingcat', 'requires_setup': True, 'depends': [], @@ -13,20 +25,54 @@ _DEFAULT_MANIFEST = { 'external_dependencies': {}, 'entry_point': '', 'version': '1.0', - 'config': {} + 'configs': {} } -def load_manifest(module): +@dataclass +class Module: + name: str + display_name: str + type: list + entry_point: str + depends: list + external_dependencies: dict + requires_setup: bool + configs: dict + description: str + path: str + manifest: dict + + def __init__(self, module_name, path, manifest): + self.name = module_name + self.path = path + self.manifest = manifest + if manifest: + self.display_name = manifest['name'] + self.type = manifest['type'] + self.entry_point = manifest['entry_point'] + self.depends = manifest['depends'] + self.external_dependencies = manifest['external_dependencies'] + self.requires_setup = manifest['requires_setup'] + self.configs = manifest['configs'] + self.description = manifest['description'] + + def __repr__(self): + return f"Module<'{self.display_name}' ({self.name})>" + + + +def load_manifest(module_path): + print(f"Loading manifest for module {module_path}") # load the manifest file manifest = copy.deepcopy(_DEFAULT_MANIFEST) - with open(join(module, MANIFEST_FILE)) as f: + with open(join(module_path, MANIFEST_FILE)) as f: manifest.update(ast.literal_eval(f.read())) return manifest -def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[dict]: +def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[Module]: # search through all valid 'modules' paths. Default is 'modules' in the current directory - + # see odoo/modules/module.py -> get_modules def is_really_module(name): if os.path.isfile(join(name, MANIFEST_FILE)): @@ -46,6 +92,6 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals manifest = load_manifest(possible_module_path) else: manifest = {} - all_modules.append((possible_module, possible_module_path, manifest)) + all_modules.append(Module(possible_module, possible_module_path, manifest)) return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index f788203..0a2273f 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -9,6 +9,11 @@ from typing import Generator, Union, List from urllib.parse import urlparse from ipaddress import ip_address import argparse +import configparser +import os +from os.path import join, dirname + +from rich_argparse import RichHelpFormatter from .context import ArchivingContext @@ -20,14 +25,15 @@ from ..enrichers import Enricher from ..databases import Database from .metadata import Metadata from ..version import __version__ -from .config import read_yaml -from .loader import available_modules, load_manifest +from .config import read_config, store_config +from .loader import available_modules, Module, MODULE_TYPES import tempfile, traceback from loguru import logger DEFAULT_CONFIG_FILE = "orchestration.yaml" + class ArchivingOrchestrator: # def __init__(self, config: Config) -> None: @@ -45,95 +51,145 @@ class ArchivingOrchestrator: # logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") # self.cleanup() - def setup_parser(self): + def setup_basic_parser(self): parser = argparse.ArgumentParser( - # prog = "auto-archiver", add_help=False, - description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!", - epilog="Check the code at https://github.com/bellingcat/auto-archiver" + description=""" + Auto Archiver is a CLI tool to archive media/metadata from online URLs; + it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)! + """, + epilog="Check the code at https://github.com/bellingcat/auto-archiver", + formatter_class=RichHelpFormatter, ) - parser.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) + parser.add_argument('--config', action='store', dest="config_file", help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE) parser.add_argument('--version', action='version', version=__version__) parser.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple') # override the default 'help' so we can inject all the configs and show those parser.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit') - self.parser = parser - - def add_module_args(self, modules: list = None): - if not modules: - modules = available_modules(with_manifest=True) + parser.add_argument('-s', '--store', action='store_true', dest='store', help='Store the created config in the config file') + self.basic_parser = parser - for module_name, module_path, manifest in modules: - for name, kwargs in manifest['config'].items(): - kwargs['dest'] = f"{module_name}.{kwargs.pop('dest', name)}" - self.parser.add_argument(f"--{module_name}.{name}", **kwargs) + def setup_complete_parser(self, basic_config: dict, ini_config: dict, unused_args: list[str]) -> None: + parser = argparse.ArgumentParser( + parents = [self.basic_parser], + add_help=False, + ) - def show_help(self): - # for the help message, we want to load *all* possible modules and show the help - # add configs as arg parser arguments - self.add_module_args() - - self.parser.print_help() - exit() - - def setup_config(self, config: dict) -> None: # check what mode we're in + # if we have a config file, use that to decide which modules to load # if simple, we'll load just the modules that has requires_setup = False # if full, we'll load all modules - if self.config.mode == 'simple': - simple_modules = [module for module in available_modules(with_manifest=True) if not module[2]['requires_setup']] - self.add_module_args(simple_modules) + if ini_config: + # only load the modules enabled in config + enabled_modules = [] + for module_type in MODULE_TYPES: + try: + enabled_modules.extend(ini_config.get("STEPS", module_type)) + except configparser.NoOptionError: + pass - # now we add the --feeders, --enrichers, --archivers, --databases, --storages, and --formatter, and make them "required" - self.parser.add_argument('--feeders', action='store', nargs='*', dest='feeders', required=True, help='the feeders to use') - self.parser.add_argument('--enrichers', action='store', nargs='*', dest='enrichers', required=True, help='the enrichers to use') - self.parser.add_argument('--extractors', action='store', nargs='*', dest='extractors', required=True, help='the enrichers to use') - self.parser.add_argument('--databases', action='store', nargs='*', dest='databases', required=True, help='the databases to use') - self.parser.add_argument('--storages', action='store', nargs='*', dest='storages', required=True, help='the storages to use') - self.parser.add_argument('--formatter', action='store', nargs='*', dest='formatter', required=True, help='the formatter to use') + # add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter' + for module_type in MODULE_TYPES: + if modules := getattr(basic_config, f"{module_type}s", []): + enabled_modules.extend(modules) + + self.add_module_args(available_modules(enabled_modules, with_manifest=True), parser) + elif basic_config.mode == 'simple': + simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] + self.add_module_args(simple_modules, parser) + # add them to the config + for module in simple_modules: + for module_type in module.type: + existing_modules = config['STEPS'] = module.name + ini_config.setdefault(f"{module_type}s", []).append(module.name) + + else: + # load all modules, they're not using the 'simple' mode + self.add_module_args(available_modules(with_manifest=True), parser) + + parser.set_defaults(**ini_config) - - config.update(self.config.__dict__) # reload the parser with the new arguments, now that we have them - self.config, unknown = self.parser.parse_known_args(config) - logger.warning(f"Ignoring unknown/unused arguments: {unknown}") + self.config, unknown = parser.parse_known_args(unused_args) + if unknown: + logger.warning(f"Ignoring unknown/unused arguments: {unknown}") + if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): + logger.info(f"Storing configuration file to {basic_config.config_file}") + store_config(ini_config, basic_config.config_file) breakpoint() - - logger.info(f"FEEDER: {self.config.feeders}") logger.info(f"ENRICHERS: {self.config.enrichers}") logger.info(f"ARCHIVERS: {self.config.archivers}") logger.info(f"DATABASES: {self.config.databases}") logger.info(f"STORAGES: {self.config.storages}") logger.info(f"FORMATTER: {self.formatter.name}") - + + def add_steps_args(self, parser: argparse.ArgumentParser = None): + if not parser: + parser = self.parser + parser.add_argument('--feeders', action='store', dest='feeders', nargs='+', required=True, help='the feeders to use') + parser.add_argument('--enrichers', action='store', dest='enrichers', nargs='+', required=True, help='the enrichers to use') + parser.add_argument('--archivers', action='store', dest='archivers', nargs='+', required=True, help='the archivers to use') + parser.add_argument('--databases', action='store', dest='databases', nargs='+', required=True, help='the databases to use') + parser.add_argument('--storages', action='store', dest='storages', nargs='+', required=True, help='the storages to use') + parser.add_argument('--formatter', action='store', dest='formatter', nargs='+', required=True, help='the formatter to use') + + def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None): + + if not modules: + modules = available_modules(with_manifest=True) + + for module in modules: + if not module.configs: + # this module has no configs, don't show anything in the help + # (TODO: do we want to show something about this module though, like a description?) + continue + + group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...") + for name, kwargs in module.configs.items(): + # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set + # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something + kwargs.pop('cli_set', None) + kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" + kwargs['type'] = type(kwargs.get('type', 'str')) + group.add_argument(f"--{module.name}.{name}", **kwargs) + + def show_help(self): + # for the help message, we want to load *all* possible modules and show the help + # add configs as arg parser arguments + + self.add_steps_args(self.basic_parser) + self.add_module_args(parser=self.basic_parser) + + self.basic_parser.print_help() + exit() def run(self) -> None: - self.setup_parser() + self.setup_basic_parser() # parse the known arguments for now (basically, we want the config file) # load the config file to get the list of enabled items - self.config, _ = self.parser.parse_known_args() + basic_config, unused_args = self.basic_parser.parse_known_args() # if help flag was called, then show the help - if self.config.help: + if basic_config.help: self.show_help() + # load the config file - config = {} + ini_config = {} try: - config = read_yaml(self.config.config_file) + ini_config = read_config(basic_config.config_file) except FileNotFoundError: - if self.config.config_file != DEFAULT_CONFIG_FILE: - logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") + if basic_config.config_file != DEFAULT_CONFIG_FILE: + logger.error(f"The configuration file {basic_config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.") exit() - self.setup_config(config) + self.setup_complete_parser(basic_config, ini_config, unused_args) - breakpoint() config.parse() diff --git a/src/auto_archiver/modules/generic_extractor/__manifest__.py b/src/auto_archiver/modules/generic_extractor/__manifest__.py index 673399e..d9d0669 100644 --- a/src/auto_archiver/modules/generic_extractor/__manifest__.py +++ b/src/auto_archiver/modules/generic_extractor/__manifest__.py @@ -30,7 +30,7 @@ the broader archiving framework. custom dropins can be created to handle additional websites and passed to the archiver via the command line using the `--dropins` option (TODO!). """, - 'config': { + 'configs': { "facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"}, "subtitles": {"default": True, "help": "download subtitles if available"}, "comments": {"default": False, "help": "download all comments if available, may lead to large metadata"}, @@ -40,7 +40,7 @@ via the command line using the `--dropins` option (TODO!). "end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."}, 'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."}, "max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."}, - "cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"}, + "cookies_from_browser": {"default": None, 'type': 'str', "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"}, "cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"}, } } \ No newline at end of file diff --git a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py index f4eb2b9..b415679 100644 --- a/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py +++ b/src/auto_archiver/modules/twitter_api_archiver/__manifest__.py @@ -13,7 +13,7 @@ }, "configs": { "bearer_token": {"default": None, "help": "[deprecated: see bearer_tokens] twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret"}, - "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line", "cli_set": lambda cli_val, cur_val: list(set(cli_val.split(",")))}, + "bearer_tokens": {"default": [], "help": " a list of twitter API bearer_token which is enough for archiving, if not provided you will need consumer_key, consumer_secret, access_token, access_secret, if provided you can still add those for better rate limits. CSV of bearer tokens if provided via the command line"}, "consumer_key": {"default": None, "help": "twitter API consumer_key"}, "consumer_secret": {"default": None, "help": "twitter API consumer_secret"}, "access_token": {"default": None, "help": "twitter API access_token"},