Merge pull request #210 from bellingcat/logger_fix

Fix issue #200 + Refactor _LAZY_LOADED_MODULES
pull/216/head v0.13.4
Patrick Robertson 2025-02-19 15:11:42 +00:00 zatwierdzone przez GitHub
commit 5211c5de18
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
15 zmienionych plików z 232 dodań i 187 usunięć

Wyświetl plik

@ -1,6 +1,6 @@
# iterate through all the modules in auto_archiver.modules and turn the __manifest__.py file into a markdown table # iterate through all the modules in auto_archiver.modules and turn the __manifest__.py file into a markdown table
from pathlib import Path from pathlib import Path
from auto_archiver.core.module import available_modules from auto_archiver.core.module import ModuleFactory
from auto_archiver.core.base_module import BaseModule from auto_archiver.core.base_module import BaseModule
from ruamel.yaml import YAML from ruamel.yaml import YAML
import io import io
@ -41,7 +41,7 @@ def generate_module_docs():
configs_cheatsheet = "\n## Configuration Options\n" configs_cheatsheet = "\n## Configuration Options\n"
configs_cheatsheet += header_row configs_cheatsheet += header_row
for module in sorted(available_modules(with_manifest=True), key=lambda x: (x.requires_setup, x.name)): for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
# generate the markdown file from the __manifest__.py file. # generate the markdown file from the __manifest__.py file.
manifest = module.manifest manifest = module.manifest

Wyświetl plik

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project] [project]
name = "auto-archiver" name = "auto-archiver"
version = "0.13.3" version = "0.13.4"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13" requires-python = ">=3.10,<3.13"

Wyświetl plik

@ -3,7 +3,7 @@
""" """
from .metadata import Metadata from .metadata import Metadata
from .media import Media from .media import Media
from .module import BaseModule from .base_module import BaseModule
# cannot import ArchivingOrchestrator/Config to avoid circular dep # cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator # from .orchestrator import ArchivingOrchestrator

Wyświetl plik

@ -1,13 +1,18 @@
from urllib.parse import urlparse from __future__ import annotations
from typing import Mapping, Any
from typing import Mapping, Any, Type, TYPE_CHECKING
from abc import ABC from abc import ABC
from copy import deepcopy, copy from copy import deepcopy, copy
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
from loguru import logger from loguru import logger
if TYPE_CHECKING:
from .module import ModuleFactory
class BaseModule(ABC): class BaseModule(ABC):
""" """
@ -17,41 +22,24 @@ class BaseModule(ABC):
however modules can have a .setup() method to run any setup code however modules can have a .setup() method to run any setup code
(e.g. logging in to a site, spinning up a browser etc.) (e.g. logging in to a site, spinning up a browser etc.)
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that See consts.MODULE_TYPES for the types of modules you can create, noting that
a subclass can be of multiple types. For example, a module that extracts data from a subclass can be of multiple types. For example, a module that extracts data from
a website and stores it in a database would be both an 'extractor' and a 'database' module. a website and stores it in a database would be both an 'extractor' and a 'database' module.
Each module is a python package, and should have a __manifest__.py file in the Each module is a python package, and should have a __manifest__.py file in the
same directory as the module file. The __manifest__.py specifies the module information same directory as the module file. The __manifest__.py specifies the module information
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the
default manifest structure. default manifest structure.
""" """
MODULE_TYPES = [ MODULE_TYPES = CONF_MODULE_TYPES
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
_DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
# NOTE: these here are declard as class variables, but they are overridden by the instance variables in the __init__ method
config: Mapping[str, Any] config: Mapping[str, Any]
authentication: Mapping[str, Mapping[str, str]] authentication: Mapping[str, Mapping[str, str]]
name: str name: str
module_factory: ModuleFactory
# this is set by the orchestrator prior to archiving # this is set by the orchestrator prior to archiving
tmp_dir: TemporaryDirectory = None tmp_dir: TemporaryDirectory = None

Wyświetl plik

@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
from loguru import logger from loguru import logger
from copy import deepcopy from copy import deepcopy
from .module import BaseModule from auto_archiver.core.consts import MODULE_TYPES
from typing import Any, List, Type, Tuple from typing import Any, List, Type, Tuple
@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration # Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined # Steps are the modules that will be run in the order they are defined
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \ steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
""" """
# Global configuration # Global configuration

Wyświetl plik

@ -0,0 +1,23 @@
# Shared constants for the module system, kept free of imports so they can be
# used from anywhere in the package.

# The kinds of module that can be created; a single module may declare
# more than one of these in its manifest 'type' list.
MODULE_TYPES = [
    "feeder",
    "extractor",
    "enricher",
    "database",
    "storage",
    "formatter",
]

# Name of the manifest file that sits in the same directory as the module file.
MANIFEST_FILE = "__manifest__.py"

# Default manifest structure; callers deep-copy this before applying the
# values from a module's own manifest file.
DEFAULT_MANIFEST = {
    # the display name of the module
    "name": "",
    # creator of the module, leave this as Bellingcat or set your own name!
    "author": "Bellingcat",
    # the type of the module, can be one or more of MODULE_TYPES
    "type": [],
    # whether or not this module requires additional setup such as setting
    # API keys or installing additional software
    "requires_setup": True,
    # a description of the module
    "description": "",
    # external dependencies, e.g. python packages or binaries, in dictionary format
    "dependencies": {},
    # the entry point for the module, in the format 'module_name::ClassName'.
    # This can be left blank to use the default entry point of
    # module_name::ModuleName
    "entry_point": "",
    # the version of the module
    "version": "1.0",
    # any configuration options this module has, these will be exposed to the
    # user in the config file or via the command line
    "configs": {},
}

Wyświetl plik

@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from typing import List from typing import List, TYPE_CHECKING
import shutil import shutil
import ast import ast
import copy import copy
@ -16,20 +16,28 @@ import os
from os.path import join from os.path import join
from loguru import logger from loguru import logger
import auto_archiver import auto_archiver
from .base_module import BaseModule from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
_LAZY_LOADED_MODULES = {} if TYPE_CHECKING:
from .base_module import BaseModule
MANIFEST_FILE = "__manifest__.py"
def setup_paths(paths: list[str]) -> None: HAS_SETUP_PATHS = False
class ModuleFactory:
def __init__(self):
self._lazy_modules = {}
def setup_paths(self, paths: list[str]) -> None:
""" """
Sets up the paths for the modules to be loaded from Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly This is necessary for the modules to be imported correctly
""" """
global HAS_SETUP_PATHS
for path in paths: for path in paths:
# check path exists, if it doesn't, log a warning # check path exists, if it doesn't, log a warning
if not os.path.exists(path): if not os.path.exists(path):
@ -38,21 +46,27 @@ def setup_paths(paths: list[str]) -> None:
# see odoo/module/module.py -> initialize_sys_path # see odoo/module/module.py -> initialize_sys_path
if path not in auto_archiver.modules.__path__: if path not in auto_archiver.modules.__path__:
if HAS_SETUP_PATHS == True:
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
auto_archiver.modules.__path__.append(path) auto_archiver.modules.__path__.append(path)
# sort based on the length of the path, so that the longest path is last in the list # sort based on the length of the path, so that the longest path is last in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True) auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
def get_module(module_name: str, config: dict) -> BaseModule: HAS_SETUP_PATHS = True
def get_module(self, module_name: str, config: dict) -> BaseModule:
""" """
Gets and sets up a module using the provided config Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy) This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
""" """
return get_module_lazy(module_name).load(config) return self.get_module_lazy(module_name).load(config)
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule: def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
""" """
Lazily loads a module, returning a LazyBaseModule Lazily loads a module, returning a LazyBaseModule
@ -61,15 +75,15 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa
To load an actual module, call .setup() on a lazy module To load an actual module, call .setup() on a lazy module
""" """
if module_name in _LAZY_LOADED_MODULES: if module_name in self._lazy_modules:
return _LAZY_LOADED_MODULES[module_name] return self._lazy_modules[module_name]
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings) available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available: if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?") raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0] return available[0]
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]: def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory # search through all valid 'modules' paths. Default is 'modules' in the current directory
@ -95,11 +109,11 @@ def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= []
possible_module_path = join(module_folder, possible_module) possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path): if not is_really_module(possible_module_path):
continue continue
if _LAZY_LOADED_MODULES.get(possible_module): if self._lazy_modules.get(possible_module):
continue continue
lazy_module = LazyBaseModule(possible_module, possible_module_path) lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self)
_LAZY_LOADED_MODULES[possible_module] = lazy_module self._lazy_modules[possible_module] = lazy_module
all_modules.append(lazy_module) all_modules.append(lazy_module)
@ -123,14 +137,16 @@ class LazyBaseModule:
type: list type: list
description: str description: str
path: str path: str
module_factory: ModuleFactory
_manifest: dict = None _manifest: dict = None
_instance: BaseModule = None _instance: BaseModule = None
_entry_point: str = None _entry_point: str = None
def __init__(self, module_name, path): def __init__(self, module_name, path, factory: ModuleFactory):
self.name = module_name self.name = module_name
self.path = path self.path = path
self.module_factory = factory
@property @property
def entry_point(self): def entry_point(self):
@ -161,7 +177,7 @@ class LazyBaseModule:
return self._manifest return self._manifest
# print(f"Loading manifest for module {module_path}") # print(f"Loading manifest for module {module_path}")
# load the manifest file # load the manifest file
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST) manifest = copy.deepcopy(DEFAULT_MANIFEST)
with open(join(self.path, MANIFEST_FILE)) as f: with open(join(self.path, MANIFEST_FILE)) as f:
try: try:
@ -189,13 +205,14 @@ class LazyBaseModule:
# clear out any empty strings that a user may have erroneously added # clear out any empty strings that a user may have erroneously added
continue continue
if not check(dep): if not check(dep):
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
exit(1) exit(1)
def check_python_dep(dep): def check_python_dep(dep):
# first check if it's a module: # first check if it's a module:
try: try:
m = get_module_lazy(dep, suppress_warnings=True) m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
try: try:
# we must now load this module and set it up with the config # we must now load this module and set it up with the config
m.load(config) m.load(config)
@ -230,19 +247,21 @@ class LazyBaseModule:
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
# finally, get the class instance # finally, get the class instance
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)() instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
if not getattr(instance, 'name', None):
# set the name, display name and module factory
instance.name = self.name instance.name = self.name
if not getattr(instance, 'display_name', None):
instance.display_name = self.display_name instance.display_name = self.display_name
instance.module_factory = self.module_factory
self._instance = instance
# merge the default config with the user config # merge the default config with the user config
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default')) default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
config[self.name] = default_config | config.get(self.name, {}) config[self.name] = default_config | config.get(self.name, {})
instance.config_setup(config) instance.config_setup(config)
instance.setup() instance.setup()
# save the instance for future easy loading
self._instance = instance
return instance return instance
def __repr__(self): def __repr__(self):

Wyświetl plik

@ -5,7 +5,7 @@
""" """
from __future__ import annotations from __future__ import annotations
from typing import Generator, Union, List, Type from typing import Generator, Union, List, Type, TYPE_CHECKING
from urllib.parse import urlparse from urllib.parse import urlparse
from ipaddress import ip_address from ipaddress import ip_address
from copy import copy from copy import copy
@ -22,12 +22,14 @@ from rich_argparse import RichHelpFormatter
from .metadata import Metadata, Media from .metadata import Metadata, Media
from auto_archiver.version import __version__ from auto_archiver.version import __version__
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule from .consts import MODULE_TYPES
from loguru import logger from loguru import logger
if TYPE_CHECKING:
from .base_module import BaseModule
from .module import LazyBaseModule
DEFAULT_CONFIG_FILE = "orchestration.yaml" DEFAULT_CONFIG_FILE = "orchestration.yaml"
@ -95,6 +97,12 @@ class UniqueAppendAction(argparse.Action):
class ArchivingOrchestrator: class ArchivingOrchestrator:
# instance variables
module_factory: ModuleFactory
setup_finished: bool
logger_id: int
# instance variables, used for convenience to access modules by step
feeders: List[Type[Feeder]] feeders: List[Type[Feeder]]
extractors: List[Type[Extractor]] extractors: List[Type[Extractor]]
enrichers: List[Type[Enricher]] enrichers: List[Type[Enricher]]
@ -102,6 +110,11 @@ class ArchivingOrchestrator:
storages: List[Type[Storage]] storages: List[Type[Storage]]
formatters: List[Type[Formatter]] formatters: List[Type[Formatter]]
def __init__(self):
self.module_factory = ModuleFactory()
self.setup_finished = False
self.logger_id = None
def setup_basic_parser(self): def setup_basic_parser(self):
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog="auto-archiver", prog="auto-archiver",
@ -133,7 +146,7 @@ class ArchivingOrchestrator:
) )
self.add_modules_args(modules_parser) self.add_modules_args(modules_parser)
cli_modules, unused_args = modules_parser.parse_known_args(unused_args) cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", []) yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
parser = DefaultValidatingParser( parser = DefaultValidatingParser(
@ -155,15 +168,15 @@ class ArchivingOrchestrator:
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = [] enabled_modules = []
# first loads the modules from the config file, then from the command line # first loads the modules from the config file, then from the command line
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# clear out duplicates, but keep the order # clear out duplicates, but keep the order
enabled_modules = list(dict.fromkeys(enabled_modules)) enabled_modules = list(dict.fromkeys(enabled_modules))
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True) avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_individual_module_args(avail_modules, parser) self.add_individual_module_args(avail_modules, parser)
elif basic_config.mode == 'simple': elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
self.add_individual_module_args(simple_modules, parser) self.add_individual_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup # for simple mode, we use the cli_feeder and any modules that don't require setup
@ -176,7 +189,7 @@ class ArchivingOrchestrator:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
else: else:
# load all modules, they're not using the 'simple' mode # load all modules, they're not using the 'simple' mode
self.add_individual_module_args(available_modules(with_manifest=True), parser) self.add_individual_module_args(self.module_factory.available_modules(), parser)
parser.set_defaults(**to_dot_notation(yaml_config)) parser.set_defaults(**to_dot_notation(yaml_config))
@ -206,7 +219,7 @@ class ArchivingOrchestrator:
parser = self.parser parser = self.parser
# Module loading from the command line # Module loading from the command line
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction) parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
def add_additional_args(self, parser: argparse.ArgumentParser = None): def add_additional_args(self, parser: argparse.ArgumentParser = None):
@ -232,7 +245,7 @@ class ArchivingOrchestrator:
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None: def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules: if not modules:
modules = available_modules(with_manifest=True) modules = self.module_factory.available_modules()
for module in modules: for module in modules:
@ -274,9 +287,16 @@ class ArchivingOrchestrator:
def setup_logging(self, config): def setup_logging(self, config):
# setup loguru logging # setup loguru logging
try:
logger.remove(0) # remove the default logger logger.remove(0) # remove the default logger
except ValueError:
pass
logging_config = config['logging'] logging_config = config['logging']
logger.add(sys.stderr, level=logging_config['level'])
# add other logging info
if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']: if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
@ -288,7 +308,7 @@ class ArchivingOrchestrator:
""" """
invalid_modules = [] invalid_modules = []
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
step_items = [] step_items = []
modules_to_load = modules_by_type[f"{module_type}s"] modules_to_load = modules_by_type[f"{module_type}s"]
@ -333,7 +353,7 @@ class ArchivingOrchestrator:
if module in invalid_modules: if module in invalid_modules:
continue continue
try: try:
loaded_module: BaseModule = get_module(module, self.config) loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e: except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module: if module_type == 'extractor' and loaded_module.name == module:
@ -359,14 +379,17 @@ class ArchivingOrchestrator:
def setup_config(self, args: list) -> dict: def setup_config(self, args: list) -> dict:
""" """
Sets up the configuration file, merging the default config with the user's config Sets up the configuration file, merging the default config with the user's config
This function should only ever be run once.
""" """
self.setup_basic_parser() self.setup_basic_parser()
# parse the known arguments for now (basically, we want the config file) # parse the known arguments for now (basically, we want the config file)
basic_config, unused_args = self.basic_parser.parse_known_args(args) basic_config, unused_args = self.basic_parser.parse_known_args(args)
# setup any custom module paths, so they'll show in the help and for arg parsing # setup any custom module paths, so they'll show in the help and for arg parsing
setup_paths(basic_config.module_paths) self.module_factory.setup_paths(basic_config.module_paths)
# if help flag was called, then show the help # if help flag was called, then show the help
if basic_config.help: if basic_config.help:
@ -378,17 +401,30 @@ class ArchivingOrchestrator:
def setup(self, args: list): def setup(self, args: list):
""" """
Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser Function to configure all setup of the orchestrator: setup configs and load modules.
This method should only ever be called once
""" """
if self.setup_finished:
logger.warning("The `setup_config()` function should only ever be run once. \
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
For code implementatations, you should call .setup_config() once then you may call .feed() \
multiple times to archive multiple URLs.")
return
self.setup_basic_parser()
self.config = self.setup_config(args) self.config = self.setup_config(args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules(self.config['steps']) self.install_modules(self.config['steps'])
# log out the modules that were loaded # log out the modules that were loaded
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
self.setup_finished = True
def _command_line_run(self, args: list) -> Generator[Metadata]: def _command_line_run(self, args: list) -> Generator[Metadata]:
""" """
This is the main entry point for the orchestrator, when run from the command line. This is the main entry point for the orchestrator, when run from the command line.

Wyświetl plik

@ -14,7 +14,7 @@ from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media, BaseModule, Metadata from auto_archiver.core import Media, BaseModule, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from auto_archiver.core.module import get_module
class Storage(BaseModule): class Storage(BaseModule):
""" """
@ -74,7 +74,7 @@ class Storage(BaseModule):
filename = random_str(24) filename = random_str(24)
elif filename_generator == "static": elif filename_generator == "static":
# load the hash_enricher module # load the hash_enricher module
he = get_module(HashEnricher, self.config) he = self.module_factory.get_module(HashEnricher, self.config)
hd = he.calculate_hash(media.filename) hd = he.calculate_hash(media.filename)
filename = hd[:24] filename = hd[:24]
else: else:

Wyświetl plik

@ -10,7 +10,6 @@ from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter from auto_archiver.core import Formatter
from auto_archiver.utils.misc import random_str from auto_archiver.utils.misc import random_str
from auto_archiver.core.module import get_module
class HtmlFormatter(Formatter): class HtmlFormatter(Formatter):
environment: Environment = None environment: Environment = None
@ -50,7 +49,7 @@ class HtmlFormatter(Formatter):
final_media = Media(filename=html_path, _mimetype="text/html") final_media = Media(filename=html_path, _mimetype="text/html")
# get the already instantiated hash_enricher module # get the already instantiated hash_enricher module
he = get_module('hash_enricher', self.config) he = self.module_factory.get_module('hash_enricher', self.config)
if len(hd := he.calculate_hash(final_media.filename)): if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}") final_media.set("hash", f"{he.algorithm}:{hd}")

Wyświetl plik

@ -4,7 +4,6 @@ from loguru import logger
from auto_archiver.core import Enricher from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module
class WhisperEnricher(Enricher): class WhisperEnricher(Enricher):
""" """
@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):
def setup(self) -> None: def setup(self) -> None:
self.stores = self.config['steps']['storages'] self.stores = self.config['steps']['storages']
self.s3 = get_module("s3_storage", self.config) self.s3 = self.module_factory.get_module("s3_storage", self.config)
if not "s3_storage" in self.stores: if not "s3_storage" in self.stores:
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return return

Wyświetl plik

@ -10,7 +10,7 @@ import hashlib
import pytest import pytest
from auto_archiver.core.metadata import Metadata from auto_archiver.core.metadata import Metadata
from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES from auto_archiver.core.module import ModuleFactory
# Test names inserted into this list will be run last. This is useful for expensive/costly tests # Test names inserted into this list will be run last. This is useful for expensive/costly tests
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important # that you only want to run if everything else succeeds (e.g. API calls). The order here is important
@ -22,19 +22,19 @@ TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
def setup_module(request): def setup_module(request):
def _setup_module(module_name, config={}): def _setup_module(module_name, config={}):
module_factory = ModuleFactory()
if isinstance(module_name, type): if isinstance(module_name, type):
# get the module name: # get the module name:
# if the class does not have a .name, use the name of the parent folder # if the class does not have a .name, use the name of the parent folder
module_name = module_name.__module__.rsplit(".",2)[-2] module_name = module_name.__module__.rsplit(".",2)[-2]
m = get_module(module_name, {module_name: config}) m = module_factory.get_module(module_name, {module_name: config})
# add the tmp_dir to the module # add the tmp_dir to the module
tmp_dir = TemporaryDirectory() tmp_dir = TemporaryDirectory()
m.tmp_dir = tmp_dir.name m.tmp_dir = tmp_dir.name
def cleanup(): def cleanup():
_LAZY_LOADED_MODULES.pop(module_name)
tmp_dir.cleanup() tmp_dir.cleanup()
request.addfinalizer(cleanup) request.addfinalizer(cleanup)

Wyświetl plik

@ -2,7 +2,7 @@ import pytest
from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module_lazy from auto_archiver.core.module import ModuleFactory
@pytest.mark.parametrize("algorithm, filename, expected_hash", [ @pytest.mark.parametrize("algorithm, filename, expected_hash", [
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"), ("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
@ -22,7 +22,7 @@ def test_default_config_values(setup_module):
def test_config(): def test_config():
# test default config # test default config
c = get_module_lazy('hash_enricher').configs c = ModuleFactory().get_module_lazy('hash_enricher').configs
assert c["algorithm"]["default"] == "SHA-256" assert c["algorithm"]["default"] == "SHA-256"
assert c["chunksize"]["default"] == 16000000 assert c["chunksize"]["default"] == 16000000
assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"] assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]

Wyświetl plik

@ -1,24 +1,18 @@
import sys import sys
import pytest import pytest
from auto_archiver.core.module import get_module_lazy, BaseModule, LazyBaseModule, _LAZY_LOADED_MODULES from auto_archiver.core.module import ModuleFactory, LazyBaseModule
from auto_archiver.core.base_module import BaseModule
@pytest.fixture @pytest.fixture
def example_module(): def example_module():
import auto_archiver import auto_archiver
module_factory = ModuleFactory()
previous_path = auto_archiver.modules.__path__ previous_path = auto_archiver.modules.__path__
auto_archiver.modules.__path__.append("tests/data/test_modules/") auto_archiver.modules.__path__.append("tests/data/test_modules/")
module = get_module_lazy("example_module") return module_factory.get_module_lazy("example_module")
yield module
# cleanup
try:
del module._manifest
except AttributeError:
pass
del _LAZY_LOADED_MODULES["example_module"]
sys.modules.pop("auto_archiver.modules.example_module.example_module", None)
auto_archiver.modules.__path__ = previous_path
def test_get_module_lazy(example_module): def test_get_module_lazy(example_module):
assert example_module.name == "example_module" assert example_module.name == "example_module"
@ -46,12 +40,14 @@ def test_module_dependency_check_loads_module(example_module):
# monkey patch the manifest to include a nonexistent dependency # monkey patch the manifest to include a nonexistent dependency
example_module.manifest["dependencies"]["python"] = ["hash_enricher"] example_module.manifest["dependencies"]["python"] = ["hash_enricher"]
module_factory = example_module.module_factory
loaded_module = example_module.load({}) loaded_module = example_module.load({})
assert loaded_module is not None assert loaded_module is not None
# check the dependency is loaded # check the dependency is loaded
assert _LAZY_LOADED_MODULES["hash_enricher"] is not None assert module_factory._lazy_modules["hash_enricher"] is not None
assert _LAZY_LOADED_MODULES["hash_enricher"]._instance is not None assert module_factory._lazy_modules["hash_enricher"]._instance is not None
def test_load_module(example_module): def test_load_module(example_module):
@ -69,7 +65,7 @@ def test_load_module(example_module):
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) @pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_load_modules(module_name): def test_load_modules(module_name):
# test that specific modules can be loaded # test that specific modules can be loaded
module = get_module_lazy(module_name) module = ModuleFactory().get_module_lazy(module_name)
assert module is not None assert module is not None
assert isinstance(module, LazyBaseModule) assert isinstance(module, LazyBaseModule)
assert module.name == module_name assert module.name == module_name
@ -86,7 +82,7 @@ def test_load_modules(module_name):
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) @pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_lazy_base_module(module_name): def test_lazy_base_module(module_name):
lazy_module = get_module_lazy(module_name) lazy_module = ModuleFactory().get_module_lazy(module_name)
assert lazy_module is not None assert lazy_module is not None
assert isinstance(lazy_module, LazyBaseModule) assert isinstance(lazy_module, LazyBaseModule)

Wyświetl plik

@ -4,7 +4,7 @@ from argparse import ArgumentParser, ArgumentTypeError
from auto_archiver.core.orchestrator import ArchivingOrchestrator from auto_archiver.core.orchestrator import ArchivingOrchestrator
from auto_archiver.version import __version__ from auto_archiver.version import __version__
from auto_archiver.core.config import read_yaml, store_yaml from auto_archiver.core.config import read_yaml, store_yaml
from auto_archiver.core.module import _LAZY_LOADED_MODULES
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml" TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
TEST_MODULES = "tests/data/test_modules/" TEST_MODULES = "tests/data/test_modules/"
@ -17,22 +17,7 @@ def test_args():
@pytest.fixture @pytest.fixture
def orchestrator(): def orchestrator():
yield ArchivingOrchestrator() return ArchivingOrchestrator()
# hack - the loguru logger starts with one logger, but if orchestrator has run before
# it'll remove the default logger, add it back in:
from loguru import logger
if not logger._core.handlers.get(0):
logger._core.handlers_count = 0
logger.add(sys.stderr)
# and remove the custom logger
if logger._core.handlers.get(1):
logger.remove(1)
# delete out any loaded modules
_LAZY_LOADED_MODULES.clear()
@pytest.fixture @pytest.fixture
def basic_parser(orchestrator) -> ArgumentParser: def basic_parser(orchestrator) -> ArgumentParser: