From f68e2726f2a71578404cebf5658503d9051d8a2f Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Mon, 27 Jan 2025 14:01:36 +0100 Subject: [PATCH] Refactor loader + step into module, use LazyBaseModule and BaseModule --- src/auto_archiver/base_processors/database.py | 6 +- src/auto_archiver/base_processors/enricher.py | 5 +- .../base_processors/extractor.py | 2 +- src/auto_archiver/base_processors/feeder.py | 5 +- .../base_processors/formatter.py | 13 +- src/auto_archiver/base_processors/storage.py | 9 +- src/auto_archiver/core/__init__.py | 2 +- src/auto_archiver/core/config.py | 2 +- src/auto_archiver/core/loader.py | 173 ---------------- src/auto_archiver/core/module.py | 196 ++++++++++++++++++ src/auto_archiver/core/orchestrator.py | 30 +-- src/auto_archiver/core/step.py | 11 - src/auto_archiver/core/validators.py | 2 + .../modules/cli_feeder/__manifest__.py | 2 +- .../modules/cli_feeder/cli_feeder.py | 3 +- .../modules/csv_db/__manifest__.py | 2 +- 16 files changed, 232 insertions(+), 231 deletions(-) delete mode 100644 src/auto_archiver/core/loader.py create mode 100644 src/auto_archiver/core/module.py delete mode 100644 src/auto_archiver/core/step.py diff --git a/src/auto_archiver/base_processors/database.py b/src/auto_archiver/base_processors/database.py index 6f13208..f7deaef 100644 --- a/src/auto_archiver/base_processors/database.py +++ b/src/auto_archiver/base_processors/database.py @@ -3,13 +3,11 @@ from dataclasses import dataclass from abc import abstractmethod, ABC from typing import Union -from auto_archiver.core import Metadata, Step +from auto_archiver.core import Metadata, BaseModule @dataclass -class Database(Step, ABC): - - name = "database" +class Database(BaseModule): def started(self, item: Metadata) -> None: """signals the DB that the given item archival has started""" diff --git a/src/auto_archiver/base_processors/enricher.py b/src/auto_archiver/base_processors/enricher.py index 3cc1a29..fe0d05f 100644 --- a/src/auto_archiver/base_processors/enricher.py +++ b/src/auto_archiver/base_processors/enricher.py @@ -11,12 +11,11 @@ Enrichers are optional but highly useful for making the archived data more power from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod, ABC -from auto_archiver.core import Metadata, Step +from auto_archiver.core import Metadata, BaseModule @dataclass -class Enricher(Step, ABC): +class Enricher(BaseModule): """Base classes and utilities for enrichers in the Auto-Archiver system.""" - name = "enricher" @abstractmethod def enrich(self, to_enrich: Metadata) -> None: pass diff --git a/src/auto_archiver/base_processors/extractor.py b/src/auto_archiver/base_processors/extractor.py index c772325..321b053 100644 --- a/src/auto_archiver/base_processors/extractor.py +++ b/src/auto_archiver/base_processors/extractor.py @@ -25,7 +25,7 @@ class Extractor: Subclasses must implement the `download` method to define platform-specific behavior. """ - def setup(self) -> None: + def setup(self, *args, **kwargs) -> None: # used when extractors need to login or do other one-time setup pass diff --git a/src/auto_archiver/base_processors/feeder.py b/src/auto_archiver/base_processors/feeder.py index 0ff541e..e539f5f 100644 --- a/src/auto_archiver/base_processors/feeder.py +++ b/src/auto_archiver/base_processors/feeder.py @@ -2,12 +2,11 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod from auto_archiver.core import Metadata -from auto_archiver.core import Step +from auto_archiver.core import BaseModule @dataclass -class Feeder(Step): - name = "feeder" +class Feeder(BaseModule): @abstractmethod def __iter__(self) -> Metadata: return None \ No newline at end of file diff --git a/src/auto_archiver/base_processors/formatter.py b/src/auto_archiver/base_processors/formatter.py index 4c59af8..beb0c0d 100644 --- a/src/auto_archiver/base_processors/formatter.py +++ b/src/auto_archiver/base_processors/formatter.py @@ -1,20 +1,11 @@ from __future__ import annotations from dataclasses import dataclass from abc import abstractmethod -from auto_archiver.core import Metadata, Media, Step +from auto_archiver.core import Metadata, Media, BaseModule @dataclass -class Formatter(Step): - name = "formatter" - - def __init__(self, config: dict) -> None: - # without this STEP.__init__ is not called - super().__init__(config) - - def init(name: str, config: dict) -> Formatter: - # only for code typing - return Step.init(name, config, Formatter) +class Formatter(BaseModule): @abstractmethod def format(self, item: Metadata) -> Media: return None \ No newline at end of file diff --git a/src/auto_archiver/base_processors/storage.py b/src/auto_archiver/base_processors/storage.py index da6b2ef..e167024 100644 --- a/src/auto_archiver/base_processors/storage.py +++ b/src/auto_archiver/base_processors/storage.py @@ -6,19 +6,14 @@ import os from auto_archiver.utils.misc import random_str -from auto_archiver.core import Media, Step, ArchivingContext, Metadata +from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher from loguru import logger from slugify import slugify @dataclass -class Storage(Step): - name = "storage" - - def init(name: str, config: dict) -> Storage: - # only for typing... - return Step.init(name, config, Storage) +class Storage(BaseModule): def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None: if media.is_stored(): diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index b78df83..10213b2 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -3,7 +3,7 @@ """ from .metadata import Metadata from .media import Media -from .step import Step +from .module import BaseModule from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index fd5d49b..81a1c10 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -9,7 +9,7 @@ import argparse from ruamel.yaml import YAML, CommentedMap, add_representer from copy import deepcopy -from .loader import MODULE_TYPES +from .module import MODULE_TYPES from typing import Any, List, Type diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py deleted file mode 100644 index bbd686e..0000000 --- a/src/auto_archiver/core/loader.py +++ /dev/null @@ -1,173 +0,0 @@ -import ast -from typing import Type -from importlib.util import find_spec -from dataclasses import dataclass -import os -import copy -from os.path import join, dirname -from typing import List -from loguru import logger -import sys -import shutil - -_LOADED_MODULES = {} - -MODULE_TYPES = [ - 'feeder', - 'enricher', - 'extractor', - 'database', - 'storage', - 'formatter' -] - -MANIFEST_FILE = "__manifest__.py" -_DEFAULT_MANIFEST = { - 'name': '', - 'author': 'Bellingcat', - 'type': [], - 'requires_setup': True, - 'description': '', - 'dependencies': {}, - 'entry_point': '', - 'version': '1.0', - 'configs': {} -} - -@dataclass -class Module: - name: str - display_name: str - type: list - dependencies: dict - requires_setup: bool - configs: dict - description: str - path: str - manifest: dict - - def __init__(self, module_name, path, manifest): - self.name = module_name - self.path = path - self.manifest = manifest - if manifest: - self.display_name = manifest['name'] - self.type = manifest['type'] - self._entry_point = manifest['entry_point'] - self.dependencies = manifest['dependencies'] - self.requires_setup = manifest['requires_setup'] - self.configs = manifest['configs'] - self.description = manifest['description'] - - @property - def entry_point(self): - if not self._entry_point: - # try to create the entry point from the module name - self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}" - return self._entry_point - - def __repr__(self): - return f"Module<'{self.display_name}' ({self.name})>" - -def load_module(module: str) -> object: # TODO: change return type to Step - - if module in _LOADED_MODULES: - return _LOADED_MODULES[module] - - # load a module by name - module = get_module(module) - if not module: - return None - # check external dependencies are installed - def check_deps(deps, check): - for dep in deps: - if not check(dep): - logger.error(f"Module '{module.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{module.name}' module? See the README for more information.") - exit(1) - - check_deps(module.dependencies.get('python', []), lambda dep: find_spec(dep)) - check_deps(module.dependencies.get('bin', []), lambda dep: shutil.which(dep)) - - qualname = f'auto_archiver.modules.{module.name}' - - logger.info(f"Loading module '{module.display_name}'...") - # first import the whole module, to make sure it's working properly - __import__(qualname) - - - # then import the file for the entry point - file_name, class_name = module.entry_point.split('::') - sub_qualname = f'{qualname}.{file_name}' - - __import__(f'{qualname}.{file_name}', fromlist=[module.entry_point]) - - # finally, get the class instance - instance = getattr(sys.modules[sub_qualname], class_name)() - if not getattr(instance, 'name', None): - instance.name = module.name - - _LOADED_MODULES[module.name] = instance - return _LOADED_MODULES[module.name] - - - # finally, load the module - -def load_manifest(module_path): - # print(f"Loading manifest for module {module_path}") - # load the manifest file - manifest = copy.deepcopy(_DEFAULT_MANIFEST) - - with open(join(module_path, MANIFEST_FILE)) as f: - try: - manifest.update(ast.literal_eval(f.read())) - except ( ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e: - logger.error(f"Error loading manifest from file {module_path}/{MANIFEST_FILE}: {e}") - return manifest - return manifest - -def get_module(module_name): - # get a module by name - try: - return available_modules(limit_to_modules=[module_name], with_manifest=True)[0] - except IndexError: - return None - -def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[Module]: - # search through all valid 'modules' paths. Default is 'modules' in the current directory - - # see odoo/modules/module.py -> get_modules - def is_really_module(name): - if os.path.isfile(join(name, MANIFEST_FILE)): - return True - - default_path = [join(dirname(dirname((__file__))), "modules")] - all_modules = [] - - for module_folder in default_path + additional_paths: - # walk through each module in module_folder and check if it has a valid manifest - try: - possible_modules = os.listdir(module_folder) - except FileNotFoundError: - logger.warning(f"Module folder {module_folder} does not exist") - continue - - for possible_module in possible_modules: - if limit_to_modules and possible_module not in limit_to_modules: - continue - - possible_module_path = join(module_folder, possible_module) - if not is_really_module(possible_module_path): - continue - # parse manifest and add to list of available modules - if with_manifest: - manifest = load_manifest(possible_module_path) - else: - manifest = {} - all_modules.append(Module(possible_module, possible_module_path, manifest)) - - if not suppress_warnings: - for module in limit_to_modules: - if not any(module == m.name for m in all_modules): - logger.warning(f"Module '{module}' not found. Are you sure it's installed?") - - return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/module.py b/src/auto_archiver/core/module.py new file mode 100644 index 0000000..96a8e5e --- /dev/null +++ b/src/auto_archiver/core/module.py @@ -0,0 +1,196 @@ +""" +Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline +by handling user configuration, validating the steps properties, and implementing dynamic instantiation. + +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import List +from abc import ABC +import shutil +import ast +import copy +import sys +from importlib.util import find_spec +import os +from os.path import join, dirname +from loguru import logger + +_LAZY_LOADED_MODULES = {} + +MODULE_TYPES = [ + 'feeder', + 'extractor', + 'enricher', + 'database', + 'storage', + 'formatter' +] + +MANIFEST_FILE = "__manifest__.py" +_DEFAULT_MANIFEST = { + 'name': '', + 'author': 'Bellingcat', + 'type': [], + 'requires_setup': True, + 'description': '', + 'dependencies': {}, + 'entry_point': '', + 'version': '1.0', + 'configs': {} +} + +class BaseModule(ABC): + + config: dict + name: str + + def setup(self, config: dict): + self.config = config + for key, val in config.get(self.name, {}).items(): + setattr(self, key, val) + +def get_module(module_name: str, additional_paths: List[str] = []): + if module_name in _LAZY_LOADED_MODULES: + return _LAZY_LOADED_MODULES[module_name] + + module = available_modules(additional_paths=additional_paths, limit_to_modules=[module_name])[0] + _LAZY_LOADED_MODULES[module_name] = module + return module + +def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[LazyBaseModule]: + # search through all valid 'modules' paths. Default is 'modules' in the current directory + + # see odoo/modules/module.py -> get_modules + def is_really_module(module_path): + if os.path.isfile(join(module_path, MANIFEST_FILE)): + return True + + default_path = [join(dirname(dirname((__file__))), "modules")] + all_modules = [] + + for module_folder in default_path + additional_paths: + # walk through each module in module_folder and check if it has a valid manifest + try: + possible_modules = os.listdir(module_folder) + except FileNotFoundError: + logger.warning(f"Module folder {module_folder} does not exist") + continue + + for possible_module in possible_modules: + if limit_to_modules and possible_module not in limit_to_modules: + continue + + possible_module_path = join(module_folder, possible_module) + if not is_really_module(possible_module_path): + continue + + all_modules.append(LazyBaseModule(possible_module, possible_module_path)) + + if not suppress_warnings: + for module in limit_to_modules: + if not any(module == m.name for m in all_modules): + logger.warning(f"Module '{module}' not found. Are you sure it's installed?") + + return all_modules + +@dataclass +class LazyBaseModule: + name: str + display_name: str + type: list + requires_setup: bool + description: str + path: str + + _manifest: dict = None + _instance: BaseModule = None + _entry_point: str = None + + def __init__(self, module_name, path): + self.name = module_name + self.path = path + + @property + def entry_point(self): + if not self._entry_point and not self.manifest['entry_point']: + # try to create the entry point from the module name + self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}" + return self._entry_point + + @property + def dependencies(self): + return self.manifest['dependencies'] + + @property + def configs(self): + return self.manifest['configs'] + + @property + def manifest(self): + if self._manifest: + return self._manifest + # print(f"Loading manifest for module {module_path}") + # load the manifest file + manifest = copy.deepcopy(_DEFAULT_MANIFEST) + + with open(join(self.path, MANIFEST_FILE)) as f: + try: + manifest.update(ast.literal_eval(f.read())) + except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e: + logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}") + + self._manifest = manifest + self.display_name = manifest['name'] + self.type = manifest['type'] + self._entry_point = manifest['entry_point'] + self.requires_setup = manifest['requires_setup'] + self.description = manifest['description'] + + return manifest + + def load(self): + if self._instance: + return self._instance + + # check external dependencies are installed + def check_deps(deps, check): + for dep in deps: + if not check(dep): + logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") + exit(1) + + check_deps(self.dependencies.get('python', []), lambda dep: find_spec(dep)) + check_deps(self.dependencies.get('bin', []), lambda dep: shutil.which(dep)) + + + logger.debug(f"Loading module '{self.display_name}'...") + + for qualname in [self.name, f'auto_archiver.modules.{self.name}']: + try: + # first import the whole module, to make sure it's working properly + __import__(qualname) + break + except ImportError: + pass + + # then import the file for the entry point + file_name, class_name = self.entry_point.split('::') + sub_qualname = f'{qualname}.{file_name}' + + __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) + + # finally, get the class instance + instance = getattr(sys.modules[sub_qualname], class_name)() + if not getattr(instance, 'name', None): + instance.name = self.name + + if not getattr(instance, 'display_name', None): + instance.display_name = self.display_name + + self._instance = instance + return instance + + def __repr__(self): + return f"Module<'{self.display_name}' ({self.name})>" \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 2419b03..2a5cf4a 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -19,8 +19,9 @@ from .context import ArchivingContext from .metadata import Metadata from ..version import __version__ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG -from .loader import available_modules, Module, MODULE_TYPES, load_module +from .module import available_modules, LazyBaseModule, MODULE_TYPES, get_module from . import validators +from .module import BaseModule import tempfile, traceback from loguru import logger @@ -107,7 +108,7 @@ class ArchivingOrchestrator: else: # load all modules, they're not using the 'simple' mode self.add_module_args(available_modules(with_manifest=True), parser) - + parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them @@ -147,22 +148,27 @@ class ArchivingOrchestrator: parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None) parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None) - def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None): + # additional modules + parser.add_argument('--additional-modules', dest='additional_modules', nargs='+', help='additional paths to search for modules', action=UniqueAppendAction) + + def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None): if not modules: modules = available_modules(with_manifest=True) - module: Module + module: LazyBaseModule for module in modules: if not module.configs: # this module has no configs, don't show anything in the help # (TODO: do we want to show something about this module though, like a description?) continue group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...") + for name, kwargs in module.configs.items(): # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set # in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something kwargs.pop('cli_set', None) + kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" try: kwargs['type'] = __builtins__.get(kwargs.get('type'), str) @@ -210,10 +216,11 @@ class ArchivingOrchestrator: logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}") exit() - for i, module in enumerate(modules_to_load): + for module in modules_to_load: if module in invalid_modules: continue - loaded_module = load_module(module) + loaded_module: BaseModule = get_module(module).load() + loaded_module.setup(self.config) if not loaded_module: invalid_modules.append(module) continue @@ -238,6 +245,8 @@ class ArchivingOrchestrator: if basic_config.help: self.show_help() + logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") + # load the config file yaml_config = {} @@ -252,12 +261,9 @@ class ArchivingOrchestrator: self.install_modules() - logger.info("FEEDERS: " + ", ".join(m.name for m in self.config['steps']['feeders'])) - logger.info("EXTRACTORS: " + ", ".join(m.name for m in self.config['steps']['extractors'])) - logger.info("ENRICHERS: " + ", ".join(m.name for m in self.config['steps']['enrichers'])) - logger.info("DATABASES: " + ", ".join(m.name for m in self.config['steps']['databases'])) - logger.info("STORAGES: " + ", ".join(m.name for m in self.config['steps']['storages'])) - logger.info("FORMATTERS: " + ", ".join(m.name for m in self.config['steps']['formatters'])) + # log out the modules that were loaded + for module_type in MODULE_TYPES: + logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"])) for item in self.feed(): pass diff --git a/src/auto_archiver/core/step.py b/src/auto_archiver/core/step.py deleted file mode 100644 index 2be99c1..0000000 --- a/src/auto_archiver/core/step.py +++ /dev/null @@ -1,11 +0,0 @@ -""" -Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline -by handling user configuration, validating the steps properties, and implementing dynamic instantiation. - -""" - -from __future__ import annotations - -class Step: - # Nothing to see here :) - pass \ No newline at end of file diff --git a/src/auto_archiver/core/validators.py b/src/auto_archiver/core/validators.py index 2bd662a..681d564 100644 --- a/src/auto_archiver/core/validators.py +++ b/src/auto_archiver/core/validators.py @@ -3,3 +3,5 @@ def example_validator(value): return "example" in value +def positive_number(value): + return value > 0 \ No newline at end of file diff --git a/src/auto_archiver/modules/cli_feeder/__manifest__.py b/src/auto_archiver/modules/cli_feeder/__manifest__.py index 1769a60..4790a25 100644 --- a/src/auto_archiver/modules/cli_feeder/__manifest__.py +++ b/src/auto_archiver/modules/cli_feeder/__manifest__.py @@ -8,9 +8,9 @@ 'entry_point': 'cli_feeder::CLIFeeder', "configs": { "urls": { - "default": None, "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", "nargs": "+", + "required": True, }, }, "description": """ diff --git a/src/auto_archiver/modules/cli_feeder/cli_feeder.py b/src/auto_archiver/modules/cli_feeder/cli_feeder.py index c5f3b23..09c46d4 100644 --- a/src/auto_archiver/modules/cli_feeder/cli_feeder.py +++ b/src/auto_archiver/modules/cli_feeder/cli_feeder.py @@ -5,11 +5,10 @@ from auto_archiver.core import Metadata, ArchivingContext class CLIFeeder(Feeder): - name = "cli_feeder" def __iter__(self) -> Metadata: for url in self.urls: - logger.debug(f"Processing {url}") + logger.debug(f"Processing URL: '{url}'") yield Metadata().set_url(url) ArchivingContext.set("folder", "cli") diff --git a/src/auto_archiver/modules/csv_db/__manifest__.py b/src/auto_archiver/modules/csv_db/__manifest__.py index d97d179..3131188 100644 --- a/src/auto_archiver/modules/csv_db/__manifest__.py +++ b/src/auto_archiver/modules/csv_db/__manifest__.py @@ -1,5 +1,5 @@ { - "name": "csv_db", + "name": "CSV Database", "type": ["database"], "requires_setup": False, "external_dependencies": {"python": ["loguru"]