Get parsing of manifest and combining with config file working

pull/183/head
Patrick Robertson 2025-01-21 20:03:10 +01:00
rodzic 241b35002c
commit 4830f99300
4 zmienionych plików z 121 dodań i 106 usunięć

Wyświetl plik

@ -5,22 +5,9 @@ flexible setup in various environments.
"""
import importlib
import argparse
import yaml
from dataclasses import dataclass, field
from typing import List
from collections import defaultdict
from loguru import logger
from ..archivers import Archiver
from ..feeders import Feeder
from ..databases import Database
from ..formatters import Formatter
from ..storages import Storage
from ..enrichers import Enricher
from . import Step
from ..utils import update_nested_dict
# @dataclass
@ -46,84 +33,45 @@ from ..utils import update_nested_dict
# self.cli_ops = {}
# self.config = {}
# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
# """
# if yaml_config_filename is provided, the --config argument is ignored,
# useful for library usage when the config values are preloaded
# overwrite_configs is a dict that overwrites the yaml file contents
# """
# # 1. parse CLI values
# if use_cli:
# parser = argparse.ArgumentParser(
# # prog = "auto-archiver",
# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
# epilog="Check the code at https://github.com/bellingcat/auto-archiver"
# )
# def parse(self, use_cli=True, yaml_config_filename: str = None, overwrite_configs: str = {}):
# """
# if yaml_config_filename is provided, the --config argument is ignored,
# useful for library usage when the config values are preloaded
# overwrite_configs is a dict that overwrites the yaml file contents
# """
# # 1. parse CLI values
# if use_cli:
# parser = argparse.ArgumentParser(
# # prog = "auto-archiver",
# description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
# epilog="Check the code at https://github.com/bellingcat/auto-archiver"
# )
# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
# parser.add_argument('--version', action='version', version=__version__)
# parser.add_argument('--config', action='store', dest='config', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default='orchestration.yaml')
# parser.add_argument('--version', action='version', version=__version__)
# # Iterate over all step subclasses to gather default configs and CLI arguments
# for configurable in self.configurable_parents:
# child: Step
# for child in configurable.__subclasses__():
# assert child.configs() is not None and type(child.configs()) == dict, f"class '{child.name}' should have a configs method returning a dict."
# for config, details in child.configs().items():
# assert "." not in child.name, f"class prop name cannot contain dots('.'): {child.name}"
# assert "." not in config, f"config property cannot contain dots('.'): {config}"
# config_path = f"{child.name}.{config}"
def format_config(config: dict) -> dict:
    """Flatten a parsed orchestration YAML into argparse-style options.

    Each entry under 'steps' becomes a '--<step>': value pair, and each key of
    each module under 'configurations' becomes '--<module>.<key>': value, so the
    whole config can be fed back through the argument parser.

    Raises ValueError if a configuration key contains a '.', which would be
    ambiguous with the 'module.key' separator.
    """
    new_config = {}
    # steps (feeders, archivers, ...) become top-level '--step' options
    for step, values in config['steps'].items():
        new_config[f"--{step}"] = values
    # per-module configurations become namespaced '--module.key' options
    for name, config_vals in config['configurations'].items():
        for key, value in config_vals.items():
            # raise instead of assert: asserts are stripped under `python -O`
            if "." in key:
                raise ValueError(f"config key cannot contain '.': {key}")
            new_config[f"--{name}.{key}"] = value
    return new_config
# self.defaults[config_path] = details["default"]
# if "cli_set" in details:
# self.cli_ops[config_path] = details["cli_set"]
# if use_cli:
# args = parser.parse_args()
# yaml_config_filename = yaml_config_filename or getattr(args, "config")
# else: args = {}
# # 2. read YAML config file (or use provided value)
# self.yaml_config = self.read_yaml(yaml_config_filename)
# update_nested_dict(self.yaml_config, overwrite_configs)
# # 3. CONFIGS: decide value with priority: CLI >> config.yaml >> default
# self.config = defaultdict(dict)
# for config_path, default in self.defaults.items():
# child, config = tuple(config_path.split("."))
# val = getattr(args, config_path, None)
# if val is not None and config_path in self.cli_ops:
# val = self.cli_ops[config_path](val, default)
# if val is None:
# val = self.yaml_config.get("configurations", {}).get(child, {}).get(config, default)
# self.config[child][config] = val
# self.config = dict(self.config)
# # 4. STEPS: read steps and validate they exist
# steps = self.yaml_config.get("steps", {})
# assert "archivers" in steps, "your configuration steps are missing the archivers property"
# assert "storages" in steps, "your configuration steps are missing the storages property"
# self.feeder = Feeder.init(steps.get("feeder", "cli_feeder"), self.config)
# self.formatter = Formatter.init(steps.get("formatter", "mute_formatter"), self.config)
# self.enrichers = [Enricher.init(e, self.config) for e in steps.get("enrichers", [])]
# self.archivers = [Archiver.init(e, self.config) for e in (steps.get("archivers") or [])]
# self.databases = [Database.init(e, self.config) for e in steps.get("databases", [])]
# self.storages = [Storage.init(e, self.config) for e in steps.get("storages", [])]
# logger.info(f"FEEDER: {self.feeder.name}")
# logger.info(f"ENRICHERS: {[x.name for x in self.enrichers]}")
# logger.info(f"ARCHIVERS: {[x.name for x in self.archivers]}")
# logger.info(f"DATABASES: {[x.name for x in self.databases]}")
# logger.info(f"STORAGES: {[x.name for x in self.storages]}")
# logger.info(f"FORMATTER: {self.formatter.name}")
class LoadFromFile (argparse.Action):
    """Custom argparse action: the option's value is an open file whose
    contents are further command-line arguments to be parsed into the
    same namespace."""

    def __call__(self, parser, namespace, values, option_string=None):
        # `values` is a file-like object; drain it first, then feed its
        # whitespace-separated tokens back through the parser
        with values as arg_file:
            tokens = arg_file.read().split()
        parser.parse_args(tokens, namespace)
def read_yaml(yaml_filename: str) -> dict:
    """Read a YAML configuration file and return it flattened into
    argparse-style '--module.key' options via format_config.

    Raises FileNotFoundError if the file does not exist (callers handle this
    to fall back to default settings).
    """
    with open(yaml_filename, "r", encoding="utf-8") as inf:
        # safe_load: never construct arbitrary objects from the config file.
        # NOTE: the flattened diff left a dead `return yaml.safe_load(inf)`
        # before this line, which made the format_config call unreachable.
        return format_config(yaml.safe_load(inf))

Wyświetl plik

@ -1,4 +1,6 @@
import ast
import os
import copy
from os.path import join, dirname
from typing import List
@ -11,15 +13,18 @@ _DEFAULT_MANIFEST = {
'external_dependencies': {},
'entry_point': '',
'version': '1.0',
'config': {}
}
def load_manifest(module):
    """Read a module's manifest file and overlay it on the default manifest.

    The manifest is a Python-literal dict parsed with ast.literal_eval, which
    evaluates literals only — no code execution.
    """
    # deep-copy so the shared _DEFAULT_MANIFEST is never mutated by update()
    manifest = copy.deepcopy(_DEFAULT_MANIFEST)
    with open(join(module, MANIFEST_FILE)) as f:
        manifest.update(ast.literal_eval(f.read()))
    return manifest
def available_modules(additional_paths: List[str] = None, with_manifest: bool = False) -> List[tuple]:
    """Discover valid modules on disk.

    Searches the default 'modules' path plus any additional_paths and returns
    a list of (module_name, module_path, manifest) tuples. The manifest dict
    is only populated when with_manifest is True, to avoid parsing every
    manifest file when the caller just needs the names/paths.

    additional_paths defaults to None (not a mutable []) and is treated as an
    empty list.
    """
    # NOTE(review): the computation of `default_path` and the `all_modules`
    # initialisation were elided in this view of the file; this assumes a
    # 'modules' directory beside this file — confirm against the original.
    # search through all valid 'modules' paths. Default is 'modules' in the current directory
    # see odoo/modules/module.py -> get_modules
    default_path = [join(dirname(__file__), "modules")]
    all_modules = []
    for module_folder in default_path + (additional_paths or []):
        # walk through each entry in module_folder and keep those with a valid manifest
        for possible_module in os.listdir(module_folder):
            possible_module_path = join(module_folder, possible_module)
            if not is_really_module(possible_module_path):
                continue
            # parse manifest only when requested and add to the list
            manifest = load_manifest(possible_module_path) if with_manifest else {}
            all_modules.append((possible_module, possible_module_path, manifest))
    return all_modules

Wyświetl plik

@ -5,9 +5,6 @@
"""
from __future__ import annotations
import ast
import os
from os.path import dirname, join
from typing import Generator, Union, List
from urllib.parse import urlparse
from ipaddress import ip_address
@ -51,23 +48,67 @@ class ArchivingOrchestrator:
def setup_parser(self):
    """Build the top-level argument parser and store it on self.parser.

    The built-in help is disabled so that module config options can be
    registered before help is printed (see show_help)."""
    base = argparse.ArgumentParser(
        # prog = "auto-archiver",
        add_help=False,
        description="Auto Archiver is a CLI tool to archive media/metadata from online URLs; it can read URLs from many sources (Google Sheets, Command Line, ...); and write results to many destinations too (CSV, Google Sheets, MongoDB, ...)!",
        epilog="Check the code at https://github.com/bellingcat/auto-archiver",
    )
    base.add_argument('--config', action='store', dest='config_file', help='the filename of the YAML configuration file (defaults to \'config.yaml\')', default=DEFAULT_CONFIG_FILE)
    base.add_argument('--version', action='version', version=__version__)
    base.add_argument('--mode', action='store', dest='mode', type=str, choices=['simple', 'full'], help='the mode to run the archiver in', default='simple')
    # override the default 'help' so we can inject all the configs and show those
    base.add_argument('-h', '--help', action='store_true', dest='help', help='show this help message and exit')
    self.parser = base
def add_module_args(self, modules: list = None):
    """Register every module's manifest config options as CLI arguments.

    Each option becomes '--<module>.<name>' with dest '<module>.<dest-or-name>'.
    When modules is None/empty, all available modules (with manifests) are used.
    """
    if not modules:
        modules = available_modules(with_manifest=True)
    for module_name, module_path, manifest in modules:
        for name, kwargs in manifest['config'].items():
            # copy before mutating: popping 'dest' from the manifest dict in
            # place would double-prefix it on a second call with the same
            # manifest (e.g. show_help after setup_config)
            arg_kwargs = dict(kwargs)
            arg_kwargs['dest'] = f"{module_name}.{arg_kwargs.pop('dest', name)}"
            self.parser.add_argument(f"--{module_name}.{name}", **arg_kwargs)
def show_help(self):
    """Print the complete help text — covering every module's config
    options — then exit the process."""
    # help should describe *all* possible modules, so register each one's
    # config options on the parser before printing
    self.add_module_args()
    self.parser.print_help()
    exit()
def setup_config(self, config: dict) -> None:
    """Merge the YAML config with CLI arguments and finish parsing.

    In 'simple' mode only modules whose manifest has requires_setup == False
    get their config options registered; 'full' mode relies on the config
    file / CLI to enumerate modules explicitly.
    """
    if self.config.mode == 'simple':
        # only expose config args for modules that work without manual setup
        simple_modules = [module for module in available_modules(with_manifest=True)
                          if not module[2]['requires_setup']]
        self.add_module_args(simple_modules)

    # the step-selection options are mandatory
    self.parser.add_argument('--feeders', action='store', nargs='*', dest='feeders', required=True, help='the feeders to use')
    self.parser.add_argument('--enrichers', action='store', nargs='*', dest='enrichers', required=True, help='the enrichers to use')
    self.parser.add_argument('--extractors', action='store', nargs='*', dest='extractors', required=True, help='the extractors to use')
    self.parser.add_argument('--databases', action='store', nargs='*', dest='databases', required=True, help='the databases to use')
    self.parser.add_argument('--storages', action='store', nargs='*', dest='storages', required=True, help='the storages to use')
    self.parser.add_argument('--formatter', action='store', nargs='*', dest='formatter', required=True, help='the formatter to use')

    # overlay the first-pass CLI values on the YAML config, then re-parse now
    # that the module/step arguments exist.
    # NOTE(review): parse_known_args expects a list of argument strings, not a
    # dict — confirm how `config` is meant to be converted before parsing.
    config.update(self.config.__dict__)
    self.config, unknown = self.parser.parse_known_args(config)
    if unknown:
        logger.warning(f"Ignoring unknown/unused arguments: {unknown}")
    # (removed a leftover breakpoint() debug statement here)

    logger.info(f"FEEDER: {self.config.feeders}")
    logger.info(f"ENRICHERS: {self.config.enrichers}")
    # fixed: previous code read self.config.archivers, but the dest is 'extractors'
    logger.info(f"EXTRACTORS: {self.config.extractors}")
    logger.info(f"DATABASES: {self.config.databases}")
    logger.info(f"STORAGES: {self.config.storages}")
    # fixed: self.formatter is never assigned in this flow; log the parsed value
    logger.info(f"FORMATTER: {self.config.formatter}")
def run(self) -> None:
    """Entry point: build the parser, load the config file, and finish setup."""
    self.setup_parser()

    # NOTE(review): some lines between setup_parser() and the parse below were
    # not visible in this view of the file — confirm nothing else belongs here.

    # first pass parses only the base args, to discover --config/--mode/--help
    # before module-specific arguments exist
    self.config, _ = self.parser.parse_known_args()

    # if the help flag was passed, show the full (module-aware) help and exit
    if self.config.help:
        self.show_help()

    # load the config file to get the list of enabled items
    config = {}
    try:
        config = read_yaml(self.config.config_file)
    except FileNotFoundError:
        # a missing *default* config file is fine (fresh setup with defaults);
        # a missing user-specified file is a hard error
        if self.config.config_file != DEFAULT_CONFIG_FILE:
            logger.error(f"The configuration file {self.config.config_file} was not found. Make sure the file exists and try again, or run without the --config file to use the default settings.")
            exit()

    self.setup_config(config)
    # (removed a leftover breakpoint() and a stale `config.parse()` call —
    # `config` is a plain dict here; .parse() belonged to the old Config class)

Wyświetl plik

@ -29,5 +29,18 @@ the broader archiving framework.
metadata objects. Some dropins are included in this generic_archiver by default, but
custom dropins can be created to handle additional websites and passed to the archiver
via the command line using the `--dropins` option (TODO!).
"""
""",
'config': {
"facebook_cookie": {"default": None, "help": "optional facebook cookie to have more access to content, from browser, looks like 'cookie: datr= xxxx'"},
"subtitles": {"default": True, "help": "download subtitles if available"},
"comments": {"default": False, "help": "download all comments if available, may lead to large metadata"},
"livestreams": {"default": False, "help": "if set, will download live streams, otherwise will skip them; see --max-filesize for more control"},
"live_from_start": {"default": False, "help": "if set, will download live streams from their earliest available moment, otherwise starts now."},
"proxy": {"default": "", "help": "http/socks (https seems to not work atm) proxy to use for the webdriver, eg https://proxy-user:password@proxy-ip:port"},
"end_means_success": {"default": True, "help": "if True, any archived content will mean a 'success', if False this archiver will not return a 'success' stage; this is useful for cases when the yt-dlp will archive a video but ignore other types of content like images or text only pages that the subsequent archivers can retrieve."},
'allow_playlist': {"default": False, "help": "If True will also download playlists, set to False if the expectation is to download a single video."},
"max_downloads": {"default": "inf", "help": "Use to limit the number of videos to download when a channel or long page is being extracted. 'inf' means no limit."},
"cookies_from_browser": {"default": None, "help": "optional browser for ytdl to extract cookies from, can be one of: brave, chrome, chromium, edge, firefox, opera, safari, vivaldi, whale"},
"cookie_file": {"default": None, "help": "optional cookie file to use for Youtube, see instructions here on how to export from your browser: https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp"},
}
}