From 1d2a1d4db7be58073428eff9a310e1cfc4268b5e Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Tue, 28 Jan 2025 11:14:12 +0100 Subject: [PATCH] Allow framework for config settings that should not be stored in config (e.g. cli_feeder.urls) Use 'do_not_store': True in the config settings to apply this. Also: fix up generic archiver dropins loading + local_storage defaults (same as what's in example orchestration) --- src/auto_archiver/core/config.py | 15 ++++++-- src/auto_archiver/core/orchestrator.py | 34 +++++++++---------- .../modules/cli_feeder/__manifest__.py | 1 + .../generic_extractor/generic_extractor.py | 3 ++ .../modules/local_storage/__manifest__.py | 6 ++-- 5 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index f724828..f98d64d 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer from copy import deepcopy from .module import MODULE_TYPES -from typing import Any, List, Type +from typing import Any, List, Type, Tuple yaml = YAML() @@ -101,6 +101,15 @@ def read_yaml(yaml_filename: str) -> CommentedMap: return config -def store_yaml(config: CommentedMap, yaml_filename: str): +# TODO: make this tidier/find a way to notify of which keys should not be stored + + +def store_yaml(config: CommentedMap, yaml_filename: str, do_not_store_keys: List[Tuple[str, str]] = []) -> None: + config_to_save = deepcopy(config) + + for key1, key2 in do_not_store_keys: + if key1 in config_to_save and key2 in config_to_save[key1]: + del config_to_save[key1][key2] + with open(yaml_filename, "w", encoding="utf-8") as outf: - yaml.dump(config, outf) \ No newline at end of file + yaml.dump(config_to_save, outf) \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 4f155db..bc897ef 100644 --- 
a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -39,20 +39,7 @@ class UniqueAppendAction(argparse.Action): class ArchivingOrchestrator: - # def __init__(self, config: Config) -> None: - # self.feeder: Feeder = config.feeder - # self.formatter: Formatter = config.formatter - # self.enrichers: List[Enricher] = config.enrichers - # self.archivers: List[Archiver] = config.archivers - # self.databases: List[Database] = config.databases - # self.storages: List[Storage] = config.storages - # ArchivingContext.set("storages", self.storages, keep_on_reset=True) - - # try: - # for a in self.all_archivers_for_setup(): a.setup() - # except (KeyboardInterrupt, Exception) as e: - # logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") - # self.cleanup() + _do_not_store_keys = [] def setup_basic_parser(self): parser = argparse.ArgumentParser( @@ -125,10 +112,10 @@ class ArchivingOrchestrator: if unknown: logger.warning(f"Ignoring unknown/unused arguments: {unknown}\nPerhaps you don't have this module enabled?") - + if (self.config != yaml_config and basic_config.store) or not os.path.isfile(basic_config.config_file): logger.info(f"Storing configuration file to {basic_config.config_file}") - store_yaml(self.config, basic_config.config_file) + store_yaml(self.config, basic_config.config_file, self._do_not_store_keys) return self.config @@ -167,6 +154,10 @@ class ArchivingOrchestrator: for name, kwargs in module.configs.items(): # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something + do_not_store = kwargs.pop('do_not_store', False) + if do_not_store: + self._do_not_store_keys.append((module.name, name)) + kwargs.pop('cli_set', None) should_store = kwargs.pop('should_store', False) kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" @@ -193,7 +184,7 @@ class 
ArchivingOrchestrator: logging_config = self.config['logging'] logger.add(sys.stderr, level=logging_config['level']) if log_file := logging_config['file']: - logger.add(log_file, rotation=logging_config['logging.rotation']) + logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) def install_modules(self): @@ -221,7 +212,14 @@ class ArchivingOrchestrator: if module in invalid_modules: continue loaded_module: BaseModule = get_module(module).load() - loaded_module.setup(self.config) + try: + loaded_module.setup(self.config) + except (KeyboardInterrupt, Exception) as e: + logger.error(f"Error during setup of archivers: {e}\n{traceback.format_exc()}") + if module_type == 'extractor': + loaded_module.cleanup() + exit() + if not loaded_module: invalid_modules.append(module) continue diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index 4790a25..01ef2e7 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -11,6 +11,7 @@ "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", "nargs": "+", "required": True, + "do_not_store": True, }, }, "description": """ diff --git a/src/auto_archiver/modules/generic_extractor/generic_extractor.py b/src/auto_archiver/modules/generic_extractor/generic_extractor.py index 57924d9..36fb71e 100644 --- a/src/auto_archiver/modules/generic_extractor/generic_extractor.py +++ b/src/auto_archiver/modules/generic_extractor/generic_extractor.py @@ -121,6 +121,7 @@ class GenericExtractor(Extractor): ie_instance = info_extractor(downloader=ydl) dropin = self.dropin_for_name(info_extractor.ie_key()) + if not dropin: # TODO: add a proper link to 'how to create your own dropin' logger.debug(f"""Could not find valid dropin for {info_extractor.IE_NAME}. 
@@ -172,6 +173,8 @@ class GenericExtractor(Extractor): def dropin_for_name(self, dropin_name: str, additional_paths = [], package=__package__) -> Type[InfoExtractor]: + dropin_name = dropin_name.lower() + if dropin_name == "generic": # no need for a dropin for the generic extractor (?) return None diff --git a/src/auto_archiver/modules/local_storage/__manifest__.py b/src/auto_archiver/modules/local_storage/__manifest__.py index c012be0..ce00953 100644 --- a/src/auto_archiver/modules/local_storage/__manifest__.py +++ b/src/auto_archiver/modules/local_storage/__manifest__.py @@ -7,16 +7,16 @@ }, "configs": { "path_generator": { - "default": "url", + "default": "flat", "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.", "choices": ["flat", "url", "random"], }, "filename_generator": { - "default": "random", + "default": "static", "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.", "choices": ["random", "static"], }, - "save_to": {"default": "./archived", "help": "folder where to save archived content"}, + "save_to": {"default": "./local_archive", "help": "folder where to save archived content"}, "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"}, }, "description": """