Tidy up imports + start on loading modules - program now starts much faster

pull/183/head
Patrick Robertson 2025-01-22 18:45:58 +01:00
rodzic b6b085854c
commit ade5ea0f6f
12 zmienionych plików z 97 dodań i 83 usunięć

Wyświetl plik

@ -1,6 +0,0 @@
from . import archivers, databases, enrichers, feeders, formatters, storages, utils, core
# need to manually specify due to cyclical deps
from .core.orchestrator import ArchivingOrchestrator
# making accessible directly
from .core.metadata import Metadata

Wyświetl plik

@ -1,6 +1,5 @@
""" Entry point for the auto_archiver package. """
from . import ArchivingOrchestrator
from auto_archiver.core.orchestrator import ArchivingOrchestrator
def main():
    """Entry point: build an ArchivingOrchestrator and run it."""
    orchestrator = ArchivingOrchestrator()
    orchestrator.run()

Wyświetl plik

@ -1,10 +1,6 @@
""" Core modules to handle things such as orchestration, metadata and configs..
"""
from .metadata import Metadata
from .media import Media
from .step import Step
from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator

Wyświetl plik

@ -61,9 +61,6 @@ class LoadFromFile (argparse.Action):
def to_dot_notation(yaml_conf: str) -> argparse.ArgumentParser:
dotdict = {}
for step, vals in yaml_conf.pop('steps', {}).items():
if vals:
dotdict[f"{step}s"] = vals
def process_subdict(subdict, prefix=""):
for key, value in subdict.items():

Wyświetl plik

@ -4,12 +4,14 @@ import os
import copy
from os.path import join, dirname
from typing import List
from loguru import logger
import sys
import shutil
MODULE_TYPES = [
'feeder',
'enricher',
'archiver',
'extractor',
'database',
'storage',
'formatter'
@ -59,7 +61,44 @@ class Module:
def __repr__(self):
return f"Module<'{self.display_name}' ({self.name})>"
def load_modules(modules):
    """Load every available module whose name is listed in ``modules``.

    The names are resolved, together with their manifests, through
    ``available_modules``; each resolved module is then handed to
    ``_load_module``.
    """
    resolved = available_modules(limit_to_modules=modules, with_manifest=True)
    for entry in resolved:
        _load_module(entry)
def _load_module(module):
    """Check a module's declared dependencies, then import its entry point.

    ``module`` must expose ``name``, ``display_name``, ``depends``,
    ``external_dependencies`` and ``entry_point``.

    A missing entry in ``module.depends`` is logged and terminates the
    program. Missing external 'python' or 'binary' dependencies are logged as
    errors but do not stop the module from being loaded.
    """
    # first make sure that the 'depends' are present in sys.modules
    # NOTE(review): sys.modules only lists modules that have already been
    # imported, not everything that is installed - confirm this is the
    # intended check now that imports are deferred.
    for dependency in module.depends:
        if dependency not in sys.modules:
            logger.error(
                f"\nModule {module.name} depends on {dependency} which is not available.\n"
                f"Have you set up the '{module.name}' module correctly? See the README for more information.\n"
            )
            sys.exit()

    # then check the external dependencies: python packages are looked up in
    # sys.modules, binaries are looked up on the PATH
    for dep_type, deps in module.external_dependencies.items():
        if dep_type == 'python':
            missing = [dep for dep in deps if dep not in sys.modules]
        elif dep_type == 'binary':
            missing = [dep for dep in deps if not shutil.which(dep)]
        else:
            # other dependency types are not checked
            continue
        for dep in missing:
            logger.error(
                f"\nModule {module.name} requires {dep} which is not available.\n"
                f"Have you installed the required dependencies for the '{module.name}' module? See the README for more information.\n"
            )

    # finally, load the module
    logger.info(f"Loading module {module.display_name}")
    # Do not rebind `module` to the imported Python module: the log line below
    # still needs the manifest object's `display_name`.
    __import__(module.entry_point, fromlist=[module.entry_point])
    logger.info(f"Module {module.display_name} loaded")
def load_manifest(module_path):
# print(f"Loading manifest for module {module_path}")
@ -70,7 +109,7 @@ def load_manifest(module_path):
manifest.update(ast.literal_eval(f.read()))
return manifest
def available_modules(additional_paths: List[str] = [], with_manifest: bool=False) -> List[Module]:
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], ) -> List[Module]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
@ -83,7 +122,16 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals
for module_folder in default_path + additional_paths:
# walk through each module in module_folder and check if it has a valid manifest
for possible_module in os.listdir(module_folder):
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue
for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
continue
possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
continue
@ -93,5 +141,9 @@ def available_modules(additional_paths: List[str] = [], with_manifest: bool=Fals
else:
manifest = {}
all_modules.append(Module(possible_module, possible_module_path, manifest))
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module {module} not found in available modules. Are you sure it's installed?")
return all_modules

Wyświetl plik

@ -11,9 +11,6 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
import ffmpeg
from ffmpeg._run import Error
from .context import ArchivingContext
from loguru import logger
@ -106,6 +103,12 @@ class Media:
return self.mimetype.startswith("image")
def is_valid_video(self) -> bool:
# Note: this is intentional, to only import ffmpeg here - when the method is called
# this speeds up loading the module. We check that 'ffmpeg' is available on startup
# when we load each manifest file
import ffmpeg
from ffmpeg._run import Error
# checks for video streams with ffmpeg, or min file size for a video
# self.is_video() should be used together with this method
try:

Wyświetl plik

@ -16,16 +16,10 @@ from rich_argparse import RichHelpFormatter
from .context import ArchivingContext
from ..archivers import Archiver
from ..feeders import Feeder
from ..formatters import Formatter
from ..storages import Storage
from ..enrichers import Enricher
from ..databases import Database
from .metadata import Metadata
from ..version import __version__
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG
from .loader import available_modules, Module, MODULE_TYPES
from .loader import available_modules, Module, MODULE_TYPES, load_modules
import tempfile, traceback
from loguru import logger
@ -74,7 +68,7 @@ class ArchivingOrchestrator:
add_help=False,
)
self.add_steps_args(parser)
breakpoint()
# check what mode we're in
# if we have a config file, use that to decide which modules to load
# if simple, we'll load just the modules that have requires_setup = False
@ -91,7 +85,7 @@ class ArchivingOrchestrator:
if modules := getattr(basic_config, f"{module_type}s", []):
enabled_modules.extend(modules)
self.add_module_args(available_modules(enabled_modules, with_manifest=True), parser)
self.add_module_args(available_modules(with_manifest=True, limit_to_modules=enabled_modules), parser)
elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup]
self.add_module_args(simple_modules, parser)
@ -103,7 +97,7 @@ class ArchivingOrchestrator:
# load all modules, they're not using the 'simple' mode
self.add_module_args(available_modules(with_manifest=True), parser)
breakpoint()
parser.set_defaults(**to_dot_notation(yaml_config))
# reload the parser with the new arguments, now that we have them
@ -114,27 +108,30 @@ class ArchivingOrchestrator:
# merge the new config with the old one
yaml_config = merge_dicts(vars(parsed), yaml_config)
if self.config and basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)):
if basic_config.store or not os.path.isfile(join(dirname(__file__), basic_config.config_file)):
logger.info(f"Storing configuration file to {basic_config.config_file}")
store_yaml(yaml_config, basic_config.config_file)
breakpoint()
logger.info(f"FEEDER: {self.config.feeders}")
logger.info(f"ENRICHERS: {self.config.enrichers}")
logger.info(f"ARCHIVERS: {self.config.archivers}")
logger.info(f"DATABASES: {self.config.databases}")
logger.info(f"STORAGES: {self.config.storages}")
logger.info(f"FORMATTER: {self.formatter.name}")
self.config = yaml_config
logger.info("FEEDERS: " + ", ".join(self.config['steps']['feeders']))
logger.info("EXTRACTORS: " + ", ".join(self.config['steps']['extractors']))
logger.info("ENRICHERS: " + ", ".join(self.config['steps']['enrichers']))
logger.info("DATABASES: " + ", ".join(self.config['steps']['databases']))
logger.info("STORAGES: " + ", ".join(self.config['steps']['storages']))
logger.info("FORMATTERS: " + ", ".join(self.config['steps']['formatters']))
return self.config
def add_steps_args(self, parser: argparse.ArgumentParser = None):
if not parser:
parser = self.parser
parser.add_argument('--feeders', action='store', dest='feeders', nargs='+', required=True, help='the feeders to use')
parser.add_argument('--enrichers', action='store', dest='enrichers', nargs='+', required=True, help='the enrichers to use')
parser.add_argument('--archivers', action='store', dest='archivers', nargs='+', required=True, help='the archivers to use')
parser.add_argument('--databases', action='store', dest='databases', nargs='+', required=True, help='the databases to use')
parser.add_argument('--storages', action='store', dest='storages', nargs='+', required=True, help='the storages to use')
parser.add_argument('--formatter', action='store', dest='formatter', nargs='+', required=True, help='the formatter to use')
parser.add_argument('--feeders', action='store', dest='steps.feeders', nargs='+', required=True, help='the feeders to use')
parser.add_argument('--enrichers', action='store', dest='steps.enrichers', nargs='+', required=True, help='the enrichers to use')
parser.add_argument('--extractors', action='store', dest='steps.extractors', nargs='+', required=True, help='the extractors to use')
parser.add_argument('--databases', action='store', dest='steps.databases', nargs='+', required=True, help='the databases to use')
parser.add_argument('--storages', action='store', dest='steps.storages', nargs='+', required=True, help='the storages to use')
parser.add_argument('--formatters', action='store', dest='steps.formatters', nargs='+', required=True, help='the formatter to use')
def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None):
@ -165,6 +162,12 @@ class ArchivingOrchestrator:
self.basic_parser.print_help()
exit()
def install_modules(self):
    """Load every module named under the configured steps.

    Collects the module names from each list in ``self.config['steps']`` into
    a set (so a module listed under several steps is loaded once) and passes
    that set to ``load_modules``.
    """
    modules = set()
    for step_modules in self.config['steps'].values():
        # Add the names themselves. Unpacking the list with * would make
        # set.update() iterate over each name and add its single characters.
        modules.update(step_modules)
    load_modules(modules)
def run(self) -> None:
self.setup_basic_parser()
@ -187,11 +190,10 @@ class ArchivingOrchestrator:
yaml_config = read_yaml(basic_config.config_file)
breakpoint()
self.setup_complete_parser(basic_config, yaml_config, unused_args)
config.parse()
self.install_modules()
for item in self.feed():
pass
@ -201,8 +203,9 @@ class ArchivingOrchestrator:
for a in self.all_archivers_for_setup(): a.cleanup()
def feed(self) -> Generator[Metadata]:
for item in self.feeder:
yield self.feed_item(item)
for feeder in self.config['steps']['feeders']:
for item in feeder:
yield self.feed_item(item)
self.cleanup()
def feed_item(self, item: Metadata) -> Metadata:

Wyświetl plik

@ -1,10 +1,4 @@
""" Databases are used to store the outputs from running the Autp Archiver.
"""
from .database import Database
from .gsheet_db import GsheetsDb
from .console_db import ConsoleDb
from .csv_db import CSVDb
from .api_db import AAApiDb
from .atlos_db import AtlosDb
"""

Wyświetl plik

@ -10,15 +10,3 @@ Enrichers are optional but highly useful for making the archived data more power
"""
from .enricher import Enricher
from .screenshot_enricher import ScreenshotEnricher
from .wayback_enricher import WaybackArchiverEnricher
from .hash_enricher import HashEnricher
from .thumbnail_enricher import ThumbnailEnricher
from .wacz_enricher import WaczArchiverEnricher
from .whisper_enricher import WhisperEnricher
from .pdq_hash_enricher import PdqHashEnricher
from .metadata_enricher import MetadataEnricher
from .meta_enricher import MetaEnricher
from .ssl_enricher import SSLEnricher
from .timestamping_enricher import TimestampingEnricher

Wyświetl plik

@ -1,7 +1,3 @@
""" Feeders handle the input of media into the Auto Archiver.
"""
from.feeder import Feeder
from .gsheet_feeder import GsheetsFeeder
from .cli_feeder import CLIFeeder
from .atlos_feeder import AtlosFeeder

Wyświetl plik

@ -1,4 +1 @@
""" Formatters for the output of the content. """
from .formatter import Formatter
from .html_formatter import HtmlFormatter
from .mute_formatter import MuteFormatter

Wyświetl plik

@ -1,8 +1,3 @@
""" This module contains the storage classes for the auto-archiver.
"""
from .storage import Storage
from .s3 import S3Storage
from .local import LocalStorage
from .gd import GDriveStorage
from .atlos import AtlosStorage
"""