From ade5ea0f6f5f8715ea22aa8df32664e905e52b6a Mon Sep 17 00:00:00 2001 From: Patrick Robertson Date: Wed, 22 Jan 2025 18:45:58 +0100 Subject: [PATCH] Tidy up imports + start on loading modules - program now starts much faster --- src/auto_archiver/__init__.py | 6 --- src/auto_archiver/__main__.py | 3 +- src/auto_archiver/core/__init__.py | 4 -- src/auto_archiver/core/config.py | 3 -- src/auto_archiver/core/loader.py | 60 +++++++++++++++++++++-- src/auto_archiver/core/media.py | 9 ++-- src/auto_archiver/core/orchestrator.py | 61 +++++++++++++----------- src/auto_archiver/databases/__init__.py | 8 +--- src/auto_archiver/enrichers/__init__.py | 12 ----- src/auto_archiver/feeders/__init__.py | 4 -- src/auto_archiver/formatters/__init__.py | 3 -- src/auto_archiver/storages/__init__.py | 7 +-- 12 files changed, 97 insertions(+), 83 deletions(-) delete mode 100644 src/auto_archiver/__init__.py diff --git a/src/auto_archiver/__init__.py b/src/auto_archiver/__init__.py deleted file mode 100644 index 307716d..0000000 --- a/src/auto_archiver/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from . import archivers, databases, enrichers, feeders, formatters, storages, utils, core - -# need to manually specify due to cyclical deps -from .core.orchestrator import ArchivingOrchestrator -# making accessible directly -from .core.metadata import Metadata diff --git a/src/auto_archiver/__main__.py b/src/auto_archiver/__main__.py index 8b2a65a..d31ec5c 100644 --- a/src/auto_archiver/__main__.py +++ b/src/auto_archiver/__main__.py @@ -1,6 +1,5 @@ """ Entry point for the auto_archiver package. """ -from . import ArchivingOrchestrator - +from auto_archiver.core.orchestrator import ArchivingOrchestrator def main(): ArchivingOrchestrator().run() diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index b78df83..779d3ac 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -1,10 +1,6 @@ """ Core modules to handle things such as orchestration, metadata and configs.. """ -from .metadata import Metadata -from .media import Media -from .step import Step -from .context import ArchivingContext # cannot import ArchivingOrchestrator/Config to avoid circular dep # from .orchestrator import ArchivingOrchestrator diff --git a/src/auto_archiver/core/config.py b/src/auto_archiver/core/config.py index 9709be6..f5d9fae 100644 --- a/src/auto_archiver/core/config.py +++ b/src/auto_archiver/core/config.py @@ -61,9 +61,6 @@ class LoadFromFile (argparse.Action): def to_dot_notation(yaml_conf: str) -> argparse.ArgumentParser: dotdict = {} - for step, vals in yaml_conf.pop('steps', {}).items(): - if vals: - dotdict[f"{step}s"] = vals def process_subdict(subdict, prefix=""): for key, value in subdict.items(): diff --git a/src/auto_archiver/core/loader.py b/src/auto_archiver/core/loader.py index 4460349..aa03b1f 100644 --- a/src/auto_archiver/core/loader.py +++ b/src/auto_archiver/core/loader.py @@ -4,12 +4,14 @@ import os import copy from os.path import join, dirname from typing import List - +from loguru import logger +import sys +import shutil MODULE_TYPES = [ 'feeder', 'enricher', - 'archiver', + 'extractor', 'database', 'storage', 'formatter' @@ -59,7 +61,44 @@ class Module: def __repr__(self): return f"Module<'{self.display_name}' ({self.name})>" +def load_modules(modules): + modules = available_modules(limit_to_modules=modules, with_manifest=True) + for module in modules: + _load_module(module) +def _load_module(module): + # first make sure that the 'depends' are installed and available in sys.args + for dependency in module.depends: + if dependency not in sys.modules: + logger.error(f""" + Module {module.name} depends on {dependency} which is not available. + + Have you set up the '{module.name}' module correctly? See the README for more information. + """) + exit() + # then check the external dependencies, these are binary dependencies that should be available on the path + for dep_type, deps in module.external_dependencies.items(): + if dep_type == 'python': + for dep in deps: + if dep not in sys.modules: + logger.error(f""" + Module {module.name} requires {dep} which is not available. + + Have you installed the required dependencies for the '{module.name}' module? See the README for more information. + """) + + elif dep_type == 'binary': + for dep in deps: + if not shutil.which(dep): + logger.error(f""" + Module {module.name} requires {dep} which is not available. + + Have you installed the required dependencies for the '{module.name}' module? See the README for more information. + """) + # finally, load the module + logger.info(f"Loading module {module.display_name}") + module = __import__(module.entry_point, fromlist=[module.entry_point]) + logger.info(f"Module {module.display_name} loaded") def load_manifest(module_path): # print(f"Loading manifest for module {module_path}") @@ -70,7 +109,7 @@ def load_manifest(module_path): manifest.update(ast.literal_eval(f.read())) return manifest -def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[Module]: +def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], ) -> List[Module]: # search through all valid 'modules' paths. Default is 'modules' in the current directory # see odoo/modules/module.py -> get_modules @@ -83,7 +122,16 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals for module_folder in default_path + additional_paths: # walk through each module in module_folder and check if it has a valid manifest - for possible_module in os.listdir(module_folder): + try: + possible_modules = os.listdir(module_folder) + except FileNotFoundError: + logger.warning(f"Module folder {module_folder} does not exist") + continue + + for possible_module in possible_modules: + if limit_to_modules and possible_module not in limit_to_modules: + continue + possible_module_path = join(module_folder, possible_module) if not is_really_module(possible_module_path): continue @@ -93,5 +141,9 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals else: manifest = {} all_modules.append(Module(possible_module, possible_module_path, manifest)) + + for module in limit_to_modules: + if not any(module == m.name for m in all_modules): + logger.warning(f"Module {module} not found in available modules. Are you sure it's installed?") return all_modules \ No newline at end of file diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py index d204a6e..e5026af 100644 --- a/src/auto_archiver/core/media.py +++ b/src/auto_archiver/core/media.py @@ -11,9 +11,6 @@ from dataclasses import dataclass, field from dataclasses_json import dataclass_json, config import mimetypes -import ffmpeg -from ffmpeg._run import Error - from .context import ArchivingContext from loguru import logger @@ -106,6 +103,12 @@ class Media: return self.mimetype.startswith("image") def is_valid_video(self) -> bool: + # Note: this is intentional, to only import ffmpeg here - when the method is called + # this speeds up loading the module. We check that 'ffmpeg' is available on startup + # when we load each manifest file + import ffmpeg + from ffmpeg._run import Error + # checks for video streams with ffmpeg, or min file size for a video # self.is_video() should be used together with this method try: diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index f8df659..ee3a190 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -16,16 +16,10 @@ from rich_argparse import RichHelpFormatter from .context import ArchivingContext -from ..archivers import Archiver -from ..feeders import Feeder -from ..formatters import Formatter -from ..storages import Storage -from ..enrichers import Enricher -from ..databases import Database from .metadata import Metadata from ..version import __version__ from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG -from .loader import available_modules, Module, MODULE_TYPES +from .loader import available_modules, Module, MODULE_TYPES, load_modules import tempfile, traceback from loguru import logger @@ -74,7 +68,7 @@ class ArchivingOrchestrator: add_help=False, ) self.add_steps_args(parser) - breakpoint() + # check what mode we're in # if we have a config file, use that to decide which modules to load # if simple, we'll load just the modules that has requires_setup = False @@ -91,7 +85,7 @@ class ArchivingOrchestrator: if modules := getattr(basic_config, f"{module_type}s", []): enabled_modules.extend(modules) - self.add_module_args(available_modules(enabled_modules, with_manifest=True), parser) + self.add_module_args(available_modules(with_manifest=True, limit_to_modules=enabled_modules), parser) elif basic_config.mode == 'simple': simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] self.add_module_args(simple_modules, parser) @@ -103,7 +97,7 @@ class ArchivingOrchestrator: # load all modules, they're not using the 'simple' mode self.add_module_args(available_modules(with_manifest=True), parser) - breakpoint() + parser.set_defaults(**to_dot_notation(yaml_config)) # reload the parser with the new arguments, now that we have them @@ -114,27 +108,30 @@ class ArchivingOrchestrator: # merge the new config with the old one yaml_config = merge_dicts(vars(parsed), yaml_config) - if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): + if basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)): logger.info(f"Storing configuration file to {basic_config.config_file}") store_yaml(yaml_config, basic_config.config_file) - breakpoint() - logger.info(f"FEEDER: {self.config.feeders}") - logger.info(f"ENRICHERS: {self.config.enrichers}") - logger.info(f"ARCHIVERS: {self.config.archivers}") - logger.info(f"DATABASES: {self.config.databases}") - logger.info(f"STORAGES: {self.config.storages}") - logger.info(f"FORMATTER: {self.formatter.name}") + + self.config = yaml_config + + logger.info("FEEDERS: " + ", ".join(self.config['steps']['feeders'])) + logger.info("EXTRACTORS: " + ", ".join(self.config['steps']['extractors'])) + logger.info("ENRICHERS: " + ", ".join(self.config['steps']['enrichers'])) + logger.info("DATABASES: " + ", ".join(self.config['steps']['databases'])) + logger.info("STORAGES: " + ", ".join(self.config['steps']['storages'])) + logger.info("FORMATTERS: " + ", ".join(self.config['steps']['formatters'])) + return self.config def add_steps_args(self, parser: argparse.ArgumentParser = None): if not parser: parser = self.parser - parser.add_argument('--feeders', action='store', dest='feeders', nargs='+', required=True, help='the feeders to use') - parser.add_argument('--enrichers', action='store', dest='enrichers', nargs='+', required=True, help='the enrichers to use') - parser.add_argument('--archivers', action='store', dest='archivers', nargs='+', required=True, help='the archivers to use') - parser.add_argument('--databases', action='store', dest='databases', nargs='+', required=True, help='the databases to use') - parser.add_argument('--storages', action='store', dest='storages', nargs='+', required=True, help='the storages to use') - parser.add_argument('--formatter', action='store', dest='formatter', nargs='+', required=True, help='the formatter to use') + parser.add_argument('--feeders', action='store', dest='steps.feeders', nargs='+', required=True, help='the feeders to use') + parser.add_argument('--enrichers', action='store', dest='steps.enrichers', nargs='+', required=True, help='the enrichers to use') + parser.add_argument('--extractors', action='store', dest='steps.extractors', nargs='+', required=True, help='the extractors to use') + parser.add_argument('--databases', action='store', dest='steps.databases', nargs='+', required=True, help='the databases to use') + parser.add_argument('--storages', action='store', dest='steps.storages', nargs='+', required=True, help='the storages to use') + parser.add_argument('--formatters', action='store', dest='steps.formatters', nargs='+', required=True, help='the formatter to use') def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None): @@ -165,6 +162,12 @@ class ArchivingOrchestrator: self.basic_parser.print_help() exit() + + def install_modules(self): + modules = set() + [modules.update(*m) for m in self.config['steps'].values()] + + load_modules(modules) def run(self) -> None: self.setup_basic_parser() @@ -187,11 +190,10 @@ class ArchivingOrchestrator: yaml_config = read_yaml(basic_config.config_file) - + breakpoint() self.setup_complete_parser(basic_config, yaml_config, unused_args) - config.parse() - + self.install_modules() for item in self.feed(): pass @@ -201,8 +203,9 @@ class ArchivingOrchestrator: for a in self.all_archivers_for_setup(): a.cleanup() def feed(self) -> Generator[Metadata]: - for item in self.feeder: - yield self.feed_item(item) + for feeder in self.config['steps']['feeders']: + for item in feeder: + yield self.feed_item(item) self.cleanup() def feed_item(self, item: Metadata) -> Metadata: diff --git a/src/auto_archiver/databases/__init__.py b/src/auto_archiver/databases/__init__.py index 4c73896..5aaa679 100644 --- a/src/auto_archiver/databases/__init__.py +++ b/src/auto_archiver/databases/__init__.py @@ -1,10 +1,4 @@ """ Databases are used to store the outputs from running the Autp Archiver. -""" -from .database import Database -from .gsheet_db import GsheetsDb -from .console_db import ConsoleDb -from .csv_db import CSVDb -from .api_db import AAApiDb -from .atlos_db import AtlosDb \ No newline at end of file +""" \ No newline at end of file diff --git a/src/auto_archiver/enrichers/__init__.py b/src/auto_archiver/enrichers/__init__.py index 64ce248..67cb0e5 100644 --- a/src/auto_archiver/enrichers/__init__.py +++ b/src/auto_archiver/enrichers/__init__.py @@ -10,15 +10,3 @@ Enrichers are optional but highly useful for making the archived data more power """ -from .enricher import Enricher -from .screenshot_enricher import ScreenshotEnricher -from .wayback_enricher import WaybackArchiverEnricher -from .hash_enricher import HashEnricher -from .thumbnail_enricher import ThumbnailEnricher -from .wacz_enricher import WaczArchiverEnricher -from .whisper_enricher import WhisperEnricher -from .pdq_hash_enricher import PdqHashEnricher -from .metadata_enricher import MetadataEnricher -from .meta_enricher import MetaEnricher -from .ssl_enricher import SSLEnricher -from .timestamping_enricher import TimestampingEnricher \ No newline at end of file diff --git a/src/auto_archiver/feeders/__init__.py b/src/auto_archiver/feeders/__init__.py index 8117672..3eb33d7 100644 --- a/src/auto_archiver/feeders/__init__.py +++ b/src/auto_archiver/feeders/__init__.py @@ -1,7 +1,3 @@ """ Feeders handle the input of media into the Auto Archiver. """ -from.feeder import Feeder -from .gsheet_feeder import GsheetsFeeder -from .cli_feeder import CLIFeeder -from .atlos_feeder import AtlosFeeder \ No newline at end of file diff --git a/src/auto_archiver/formatters/__init__.py b/src/auto_archiver/formatters/__init__.py index af96f15..1a9dcd0 100644 --- a/src/auto_archiver/formatters/__init__.py +++ b/src/auto_archiver/formatters/__init__.py @@ -1,4 +1 @@ """ Formatters for the output of the content. """ -from .formatter import Formatter -from .html_formatter import HtmlFormatter -from .mute_formatter import MuteFormatter \ No newline at end of file diff --git a/src/auto_archiver/storages/__init__.py b/src/auto_archiver/storages/__init__.py index bff83e6..0765833 100644 --- a/src/auto_archiver/storages/__init__.py +++ b/src/auto_archiver/storages/__init__.py @@ -1,8 +1,3 @@ """ This module contains the storage classes for the auto-archiver. -""" -from .storage import Storage -from .s3 import S3Storage -from .local import LocalStorage -from .gd import GDriveStorage -from .atlos import AtlosStorage \ No newline at end of file +""" \ No newline at end of file