Merge pull request #210 from bellingcat/logger_fix

Fix issue #200 + Refactor _LAZY_LOADED_MODULES
pull/216/head v0.13.4
Patrick Robertson 2025-02-19 15:11:42 +00:00 zatwierdzone przez GitHub
commit 5211c5de18
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
15 zmienionych plików z 232 dodań i 187 usunięć

Wyświetl plik

@ -1,6 +1,6 @@
# iterate through all the modules in auto_archiver.modules and turn the __manifest__.py file into a markdown table # iterate through all the modules in auto_archiver.modules and turn the __manifest__.py file into a markdown table
from pathlib import Path from pathlib import Path
from auto_archiver.core.module import available_modules from auto_archiver.core.module import ModuleFactory
from auto_archiver.core.base_module import BaseModule from auto_archiver.core.base_module import BaseModule
from ruamel.yaml import YAML from ruamel.yaml import YAML
import io import io
@ -41,7 +41,7 @@ def generate_module_docs():
configs_cheatsheet = "\n## Configuration Options\n" configs_cheatsheet = "\n## Configuration Options\n"
configs_cheatsheet += header_row configs_cheatsheet += header_row
for module in sorted(available_modules(with_manifest=True), key=lambda x: (x.requires_setup, x.name)): for module in sorted(ModuleFactory().available_modules(), key=lambda x: (x.requires_setup, x.name)):
# generate the markdown file from the __manifest__.py file. # generate the markdown file from the __manifest__.py file.
manifest = module.manifest manifest = module.manifest

Wyświetl plik

@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
[project] [project]
name = "auto-archiver" name = "auto-archiver"
version = "0.13.3" version = "0.13.4"
description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)." description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
requires-python = ">=3.10,<3.13" requires-python = ">=3.10,<3.13"

Wyświetl plik

@ -3,7 +3,7 @@
""" """
from .metadata import Metadata from .metadata import Metadata
from .media import Media from .media import Media
from .module import BaseModule from .base_module import BaseModule
# cannot import ArchivingOrchestrator/Config to avoid circular dep # cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator # from .orchestrator import ArchivingOrchestrator

Wyświetl plik

@ -1,13 +1,18 @@
from urllib.parse import urlparse from __future__ import annotations
from typing import Mapping, Any
from typing import Mapping, Any, Type, TYPE_CHECKING
from abc import ABC from abc import ABC
from copy import deepcopy, copy from copy import deepcopy, copy
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from auto_archiver.utils import url as UrlUtil from auto_archiver.utils import url as UrlUtil
from auto_archiver.core.consts import MODULE_TYPES as CONF_MODULE_TYPES
from loguru import logger from loguru import logger
if TYPE_CHECKING:
from .module import ModuleFactory
class BaseModule(ABC): class BaseModule(ABC):
""" """
@ -17,41 +22,24 @@ class BaseModule(ABC):
however modules can have a .setup() method to run any setup code however modules can have a .setup() method to run any setup code
(e.g. logging in to a site, spinning up a browser etc.) (e.g. logging in to a site, spinning up a browser etc.)
See BaseModule.MODULE_TYPES for the types of modules you can create, noting that See consts.MODULE_TYPES for the types of modules you can create, noting that
a subclass can be of multiple types. For example, a module that extracts data from a subclass can be of multiple types. For example, a module that extracts data from
a website and stores it in a database would be both an 'extractor' and a 'database' module. a website and stores it in a database would be both an 'extractor' and a 'database' module.
Each module is a python package, and should have a __manifest__.py file in the Each module is a python package, and should have a __manifest__.py file in the
same directory as the module file. The __manifest__.py specifies the module information same directory as the module file. The __manifest__.py specifies the module information
like name, author, version, dependencies etc. See BaseModule._DEFAULT_MANIFEST for the like name, author, version, dependencies etc. See DEFAULT_MANIFEST for the
default manifest structure. default manifest structure.
""" """
MODULE_TYPES = [ MODULE_TYPES = CONF_MODULE_TYPES
'feeder',
'extractor',
'enricher',
'database',
'storage',
'formatter'
]
_DEFAULT_MANIFEST = {
'name': '', # the display name of the module
'author': 'Bellingcat', # creator of the module, leave this as Bellingcat or set your own name!
'type': [], # the type of the module, can be one or more of BaseModule.MODULE_TYPES
'requires_setup': True, # whether or not this module requires additional setup such as setting API Keys or installing additional softare
'description': '', # a description of the module
'dependencies': {}, # external dependencies, e.g. python packages or binaries, in dictionary format
'entry_point': '', # the entry point for the module, in the format 'module_name::ClassName'. This can be left blank to use the default entry point of module_name::ModuleName
'version': '1.0', # the version of the module
'configs': {} # any configuration options this module has, these will be exposed to the user in the config file or via the command line
}
# NOTE: these are declared here as class variables, but they are overridden by the instance variables in the __init__ method
config: Mapping[str, Any] config: Mapping[str, Any]
authentication: Mapping[str, Mapping[str, str]] authentication: Mapping[str, Mapping[str, str]]
name: str name: str
module_factory: ModuleFactory
# this is set by the orchestrator prior to archiving # this is set by the orchestrator prior to archiving
tmp_dir: TemporaryDirectory = None tmp_dir: TemporaryDirectory = None

Wyświetl plik

@ -11,7 +11,7 @@ from ruamel.yaml import YAML, CommentedMap, add_representer
from loguru import logger from loguru import logger
from copy import deepcopy from copy import deepcopy
from .module import BaseModule from auto_archiver.core.consts import MODULE_TYPES
from typing import Any, List, Type, Tuple from typing import Any, List, Type, Tuple
@ -21,7 +21,7 @@ EMPTY_CONFIG = _yaml.load("""
# Auto Archiver Configuration # Auto Archiver Configuration
# Steps are the modules that will be run in the order they are defined # Steps are the modules that will be run in the order they are defined
steps:""" + "".join([f"\n {module}s: []" for module in BaseModule.MODULE_TYPES]) + \ steps:""" + "".join([f"\n {module}s: []" for module in MODULE_TYPES]) + \
""" """
# Global configuration # Global configuration

Wyświetl plik

@ -0,0 +1,23 @@
# The step types a module can declare in its manifest; a single module may be of several types.
MODULE_TYPES = ['feeder', 'extractor', 'enricher', 'database', 'storage', 'formatter']

# Name of the manifest file that sits in each module's directory.
MANIFEST_FILE = "__manifest__.py"

# Baseline manifest values; a module's own manifest is loaded on top of a deep copy of this.
DEFAULT_MANIFEST = dict(
    # the display name of the module
    name='',
    # creator of the module, leave this as Bellingcat or set your own name!
    author='Bellingcat',
    # the type of the module, can be one or more of MODULE_TYPES
    type=[],
    # whether or not this module requires additional setup such as setting API keys
    # or installing additional software
    requires_setup=True,
    # a description of the module
    description='',
    # external dependencies, e.g. python packages or binaries, in dictionary format
    dependencies={},
    # the entry point for the module, in the format 'module_name::ClassName'.
    # This can be left blank to use the default entry point of module_name::ModuleName
    entry_point='',
    # the version of the module
    version='1.0',
    # any configuration options this module has; these will be exposed to the user
    # in the config file or via the command line
    configs={},
)

Wyświetl plik

@ -6,7 +6,7 @@ by handling user configuration, validating the steps properties, and implementin
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from typing import List from typing import List, TYPE_CHECKING
import shutil import shutil
import ast import ast
import copy import copy
@ -16,99 +16,113 @@ import os
from os.path import join from os.path import join
from loguru import logger from loguru import logger
import auto_archiver import auto_archiver
from .base_module import BaseModule from auto_archiver.core.consts import DEFAULT_MANIFEST, MANIFEST_FILE
_LAZY_LOADED_MODULES = {} if TYPE_CHECKING:
from .base_module import BaseModule
MANIFEST_FILE = "__manifest__.py"
def setup_paths(paths: list[str]) -> None: HAS_SETUP_PATHS = False
"""
Sets up the paths for the modules to be loaded from
This is necessary for the modules to be imported correctly
"""
for path in paths:
# check path exists, if it doesn't, log a warning
if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
continue
# see odoo/module/module.py -> initialize_sys_path class ModuleFactory:
if path not in auto_archiver.modules.__path__:
auto_archiver.modules.__path__.append(path)
# sort based on the length of the path, so that the longest path is last in the list def __init__(self):
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True) self._lazy_modules = {}
def get_module(module_name: str, config: dict) -> BaseModule: def setup_paths(self, paths: list[str]) -> None:
""" """
Gets and sets up a module using the provided config Sets up the paths for the modules to be loaded from
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy) This is necessary for the modules to be imported correctly
""" """
return get_module_lazy(module_name).load(config) global HAS_SETUP_PATHS
def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule: for path in paths:
""" # check path exists, if it doesn't, log a warning
Lazily loads a module, returning a LazyBaseModule if not os.path.exists(path):
logger.warning(f"Path '{path}' does not exist. Skipping...")
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .setup() on a lazy module
"""
if module_name in _LAZY_LOADED_MODULES:
return _LAZY_LOADED_MODULES[module_name]
available = available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
def is_really_module(module_path):
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
all_modules = []
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue
for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
continue continue
possible_module_path = join(module_folder, possible_module) # see odoo/module/module.py -> initialize_sys_path
if not is_really_module(possible_module_path): if path not in auto_archiver.modules.__path__:
if HAS_SETUP_PATHS == True:
logger.warning(f"You are attempting to re-initialise the module paths with: '{path}' for a 2nd time. \
This could lead to unexpected behaviour. It is recommended to only use a single modules path. \
If you wish to load modules from different paths then load a 2nd python interpreter (e.g. using multiprocessing).")
auto_archiver.modules.__path__.append(path)
# sort by path length in descending order, so that the longest path is first in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
HAS_SETUP_PATHS = True
def get_module(self, module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
This will actually load and instantiate the module, and load all its dependencies (i.e. not lazy)
"""
return self.get_module_lazy(module_name).load(config)
def get_module_lazy(self, module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
"""
Lazily loads a module, returning a LazyBaseModule
This has all the information about the module, but does not load the module itself or its dependencies
To load an actual module, call .setup() on a lazy module
"""
if module_name in self._lazy_modules:
return self._lazy_modules[module_name]
available = self.available_modules(limit_to_modules=[module_name], suppress_warnings=suppress_warnings)
if not available:
raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed/exists?")
return available[0]
def available_modules(self, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules
def is_really_module(module_path):
if os.path.isfile(join(module_path, MANIFEST_FILE)):
return True
all_modules = []
for module_folder in auto_archiver.modules.__path__:
# walk through each module in module_folder and check if it has a valid manifest
try:
possible_modules = os.listdir(module_folder)
except FileNotFoundError:
logger.warning(f"Module folder {module_folder} does not exist")
continue continue
if _LAZY_LOADED_MODULES.get(possible_module):
continue
lazy_module = LazyBaseModule(possible_module, possible_module_path)
_LAZY_LOADED_MODULES[possible_module] = lazy_module for possible_module in possible_modules:
if limit_to_modules and possible_module not in limit_to_modules:
continue
all_modules.append(lazy_module) possible_module_path = join(module_folder, possible_module)
if not is_really_module(possible_module_path):
if not suppress_warnings: continue
for module in limit_to_modules: if self._lazy_modules.get(possible_module):
if not any(module == m.name for m in all_modules): continue
logger.warning(f"Module '{module}' not found. Are you sure it's installed?") lazy_module = LazyBaseModule(possible_module, possible_module_path, factory=self)
return all_modules self._lazy_modules[possible_module] = lazy_module
all_modules.append(lazy_module)
if not suppress_warnings:
for module in limit_to_modules:
if not any(module == m.name for m in all_modules):
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
return all_modules
@dataclass @dataclass
class LazyBaseModule: class LazyBaseModule:
@ -123,14 +137,16 @@ class LazyBaseModule:
type: list type: list
description: str description: str
path: str path: str
module_factory: ModuleFactory
_manifest: dict = None _manifest: dict = None
_instance: BaseModule = None _instance: BaseModule = None
_entry_point: str = None _entry_point: str = None
def __init__(self, module_name, path): def __init__(self, module_name, path, factory: ModuleFactory):
self.name = module_name self.name = module_name
self.path = path self.path = path
self.module_factory = factory
@property @property
def entry_point(self): def entry_point(self):
@ -161,7 +177,7 @@ class LazyBaseModule:
return self._manifest return self._manifest
# print(f"Loading manifest for module {module_path}") # print(f"Loading manifest for module {module_path}")
# load the manifest file # load the manifest file
manifest = copy.deepcopy(BaseModule._DEFAULT_MANIFEST) manifest = copy.deepcopy(DEFAULT_MANIFEST)
with open(join(self.path, MANIFEST_FILE)) as f: with open(join(self.path, MANIFEST_FILE)) as f:
try: try:
@ -189,13 +205,14 @@ class LazyBaseModule:
# clear out any empty strings that a user may have erroneously added # clear out any empty strings that a user may have erroneously added
continue continue
if not check(dep): if not check(dep):
logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.") logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available/setup. \
Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
exit(1) exit(1)
def check_python_dep(dep): def check_python_dep(dep):
# first check if it's a module: # first check if it's a module:
try: try:
m = get_module_lazy(dep, suppress_warnings=True) m = self.module_factory.get_module_lazy(dep, suppress_warnings=True)
try: try:
# we must now load this module and set it up with the config # we must now load this module and set it up with the config
m.load(config) m.load(config)
@ -230,19 +247,21 @@ class LazyBaseModule:
__import__(f'{qualname}.{file_name}', fromlist=[self.entry_point]) __import__(f'{qualname}.{file_name}', fromlist=[self.entry_point])
# finally, get the class instance # finally, get the class instance
instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)() instance: BaseModule = getattr(sys.modules[sub_qualname], class_name)()
if not getattr(instance, 'name', None):
instance.name = self.name
if not getattr(instance, 'display_name', None):
instance.display_name = self.display_name
self._instance = instance
# set the name, display name and module factory
instance.name = self.name
instance.display_name = self.display_name
instance.module_factory = self.module_factory
# merge the default config with the user config # merge the default config with the user config
default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default')) default_config = dict((k, v['default']) for k, v in self.configs.items() if v.get('default'))
config[self.name] = default_config | config.get(self.name, {}) config[self.name] = default_config | config.get(self.name, {})
instance.config_setup(config) instance.config_setup(config)
instance.setup() instance.setup()
# save the instance for future easy loading
self._instance = instance
return instance return instance
def __repr__(self): def __repr__(self):

Wyświetl plik

@ -5,7 +5,7 @@
""" """
from __future__ import annotations from __future__ import annotations
from typing import Generator, Union, List, Type from typing import Generator, Union, List, Type, TYPE_CHECKING
from urllib.parse import urlparse from urllib.parse import urlparse
from ipaddress import ip_address from ipaddress import ip_address
from copy import copy from copy import copy
@ -22,12 +22,14 @@ from rich_argparse import RichHelpFormatter
from .metadata import Metadata, Media from .metadata import Metadata, Media
from auto_archiver.version import __version__ from auto_archiver.version import __version__
from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser from .config import _yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths from .module import ModuleFactory, LazyBaseModule
from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher from . import validators, Feeder, Extractor, Database, Storage, Formatter, Enricher
from .module import BaseModule from .consts import MODULE_TYPES
from loguru import logger from loguru import logger
if TYPE_CHECKING:
from .base_module import BaseModule
from .module import LazyBaseModule
DEFAULT_CONFIG_FILE = "orchestration.yaml" DEFAULT_CONFIG_FILE = "orchestration.yaml"
@ -95,6 +97,12 @@ class UniqueAppendAction(argparse.Action):
class ArchivingOrchestrator: class ArchivingOrchestrator:
# instance variables
module_factory: ModuleFactory
setup_finished: bool
logger_id: int
# instance variables, used for convenience to access modules by step
feeders: List[Type[Feeder]] feeders: List[Type[Feeder]]
extractors: List[Type[Extractor]] extractors: List[Type[Extractor]]
enrichers: List[Type[Enricher]] enrichers: List[Type[Enricher]]
@ -102,6 +110,11 @@ class ArchivingOrchestrator:
storages: List[Type[Storage]] storages: List[Type[Storage]]
formatters: List[Type[Formatter]] formatters: List[Type[Formatter]]
def __init__(self):
self.module_factory = ModuleFactory()
self.setup_finished = False
self.logger_id = None
def setup_basic_parser(self): def setup_basic_parser(self):
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog="auto-archiver", prog="auto-archiver",
@ -133,7 +146,7 @@ class ArchivingOrchestrator:
) )
self.add_modules_args(modules_parser) self.add_modules_args(modules_parser)
cli_modules, unused_args = modules_parser.parse_known_args(unused_args) cli_modules, unused_args = modules_parser.parse_known_args(unused_args)
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", []) yaml_config['steps'][f"{module_type}s"] = getattr(cli_modules, f"{module_type}s", []) or yaml_config['steps'].get(f"{module_type}s", [])
parser = DefaultValidatingParser( parser = DefaultValidatingParser(
@ -155,15 +168,15 @@ class ArchivingOrchestrator:
# TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty? # TODO: if some steps are empty (e.g. 'feeders' is empty), should we default to the 'simple' ones? Or only if they are ALL empty?
enabled_modules = [] enabled_modules = []
# first loads the modules from the config file, then from the command line # first loads the modules from the config file, then from the command line
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", [])) enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# clear out duplicates, but keep the order # clear out duplicates, but keep the order
enabled_modules = list(dict.fromkeys(enabled_modules)) enabled_modules = list(dict.fromkeys(enabled_modules))
avail_modules = available_modules(with_manifest=True, limit_to_modules=enabled_modules, suppress_warnings=True) avail_modules = self.module_factory.available_modules(limit_to_modules=enabled_modules, suppress_warnings=True)
self.add_individual_module_args(avail_modules, parser) self.add_individual_module_args(avail_modules, parser)
elif basic_config.mode == 'simple': elif basic_config.mode == 'simple':
simple_modules = [module for module in available_modules(with_manifest=True) if not module.requires_setup] simple_modules = [module for module in self.module_factory.available_modules() if not module.requires_setup]
self.add_individual_module_args(simple_modules, parser) self.add_individual_module_args(simple_modules, parser)
# for simple mode, we use the cli_feeder and any modules that don't require setup # for simple mode, we use the cli_feeder and any modules that don't require setup
@ -176,7 +189,7 @@ class ArchivingOrchestrator:
yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name) yaml_config['steps'].setdefault(f"{module_type}s", []).append(module.name)
else: else:
# load all modules, they're not using the 'simple' mode # load all modules, they're not using the 'simple' mode
self.add_individual_module_args(available_modules(with_manifest=True), parser) self.add_individual_module_args(self.module_factory.available_modules(), parser)
parser.set_defaults(**to_dot_notation(yaml_config)) parser.set_defaults(**to_dot_notation(yaml_config))
@ -206,7 +219,7 @@ class ArchivingOrchestrator:
parser = self.parser parser = self.parser
# Module loading from the command line # Module loading from the command line
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction) parser.add_argument(f'--{module_type}s', dest=f'{module_type}s', nargs='+', help=f'the {module_type}s to use', default=[], action=UniqueAppendAction)
def add_additional_args(self, parser: argparse.ArgumentParser = None): def add_additional_args(self, parser: argparse.ArgumentParser = None):
@ -232,7 +245,7 @@ class ArchivingOrchestrator:
def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None: def add_individual_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None) -> None:
if not modules: if not modules:
modules = available_modules(with_manifest=True) modules = self.module_factory.available_modules()
for module in modules: for module in modules:
@ -274,11 +287,18 @@ class ArchivingOrchestrator:
def setup_logging(self, config): def setup_logging(self, config):
# setup loguru logging # setup loguru logging
logger.remove(0) # remove the default logger try:
logger.remove(0) # remove the default logger
except ValueError:
pass
logging_config = config['logging'] logging_config = config['logging']
logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']: # add other logging info
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation']) if self.logger_id is None: # note - need direct comparison to None since need to consider falsy value 0
self.logger_id = logger.add(sys.stderr, level=logging_config['level'])
if log_file := logging_config['file']:
logger.add(log_file) if not logging_config['rotation'] else logger.add(log_file, rotation=logging_config['rotation'])
def install_modules(self, modules_by_type): def install_modules(self, modules_by_type):
""" """
@ -288,7 +308,7 @@ class ArchivingOrchestrator:
""" """
invalid_modules = [] invalid_modules = []
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
step_items = [] step_items = []
modules_to_load = modules_by_type[f"{module_type}s"] modules_to_load = modules_by_type[f"{module_type}s"]
@ -333,7 +353,7 @@ class ArchivingOrchestrator:
if module in invalid_modules: if module in invalid_modules:
continue continue
try: try:
loaded_module: BaseModule = get_module(module, self.config) loaded_module: BaseModule = self.module_factory.get_module(module, self.config)
except (KeyboardInterrupt, Exception) as e: except (KeyboardInterrupt, Exception) as e:
logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}") logger.error(f"Error during setup of modules: {e}\n{traceback.format_exc()}")
if module_type == 'extractor' and loaded_module.name == module: if module_type == 'extractor' and loaded_module.name == module:
@ -359,14 +379,17 @@ class ArchivingOrchestrator:
def setup_config(self, args: list) -> dict: def setup_config(self, args: list) -> dict:
""" """
Sets up the configuration file, merging the default config with the user's config Sets up the configuration file, merging the default config with the user's config
This function should only ever be run once.
""" """
self.setup_basic_parser() self.setup_basic_parser()
# parse the known arguments for now (basically, we want the config file) # parse the known arguments for now (basically, we want the config file)
basic_config, unused_args = self.basic_parser.parse_known_args(args) basic_config, unused_args = self.basic_parser.parse_known_args(args)
# setup any custom module paths, so they'll show in the help and for arg parsing # setup any custom module paths, so they'll show in the help and for arg parsing
setup_paths(basic_config.module_paths) self.module_factory.setup_paths(basic_config.module_paths)
# if help flag was called, then show the help # if help flag was called, then show the help
if basic_config.help: if basic_config.help:
@ -378,16 +401,29 @@ class ArchivingOrchestrator:
def setup(self, args: list): def setup(self, args: list):
""" """
Main entry point for the orchestrator, sets up the basic parser, loads the config file, and sets up the complete parser Function to configure all setup of the orchestrator: setup configs and load modules.
This method should only ever be called once
""" """
if self.setup_finished:
logger.warning("The `setup_config()` function should only ever be run once. \
If you need to re-run the setup, please re-instantiate a new instance of the orchestrator. \
For code implementatations, you should call .setup_config() once then you may call .feed() \
multiple times to archive multiple URLs.")
return
self.setup_basic_parser()
self.config = self.setup_config(args) self.config = self.setup_config(args)
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========") logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
self.install_modules(self.config['steps']) self.install_modules(self.config['steps'])
# log out the modules that were loaded # log out the modules that were loaded
for module_type in BaseModule.MODULE_TYPES: for module_type in MODULE_TYPES:
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s"))) logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in getattr(self, f"{module_type}s")))
self.setup_finished = True
def _command_line_run(self, args: list) -> Generator[Metadata]: def _command_line_run(self, args: list) -> Generator[Metadata]:
""" """

Wyświetl plik

@ -14,7 +14,7 @@ from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media, BaseModule, Metadata from auto_archiver.core import Media, BaseModule, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from auto_archiver.core.module import get_module
class Storage(BaseModule): class Storage(BaseModule):
""" """
@ -74,7 +74,7 @@ class Storage(BaseModule):
filename = random_str(24) filename = random_str(24)
elif filename_generator == "static": elif filename_generator == "static":
# load the hash_enricher module # load the hash_enricher module
he = get_module(HashEnricher, self.config) he = self.module_factory.get_module(HashEnricher, self.config)
hd = he.calculate_hash(media.filename) hd = he.calculate_hash(media.filename)
filename = hd[:24] filename = hd[:24]
else: else:

Wyświetl plik

@ -10,7 +10,6 @@ from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
from auto_archiver.core import Formatter from auto_archiver.core import Formatter
from auto_archiver.utils.misc import random_str from auto_archiver.utils.misc import random_str
from auto_archiver.core.module import get_module
class HtmlFormatter(Formatter): class HtmlFormatter(Formatter):
environment: Environment = None environment: Environment = None
@ -50,7 +49,7 @@ class HtmlFormatter(Formatter):
final_media = Media(filename=html_path, _mimetype="text/html") final_media = Media(filename=html_path, _mimetype="text/html")
# get the already instantiated hash_enricher module # get the already instantiated hash_enricher module
he = get_module('hash_enricher', self.config) he = self.module_factory.get_module('hash_enricher', self.config)
if len(hd := he.calculate_hash(final_media.filename)): if len(hd := he.calculate_hash(final_media.filename)):
final_media.set("hash", f"{he.algorithm}:{hd}") final_media.set("hash", f"{he.algorithm}:{hd}")

Wyświetl plik

@ -4,7 +4,6 @@ from loguru import logger
from auto_archiver.core import Enricher from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module
class WhisperEnricher(Enricher): class WhisperEnricher(Enricher):
""" """
@ -15,7 +14,7 @@ class WhisperEnricher(Enricher):
def setup(self) -> None: def setup(self) -> None:
self.stores = self.config['steps']['storages'] self.stores = self.config['steps']['storages']
self.s3 = get_module("s3_storage", self.config) self.s3 = self.module_factory.get_module("s3_storage", self.config)
if not "s3_storage" in self.stores: if not "s3_storage" in self.stores:
logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.") logger.error("WhisperEnricher: To use the WhisperEnricher you need to use S3Storage so files are accessible publicly to the whisper service being called.")
return return

Wyświetl plik

@ -10,7 +10,7 @@ import hashlib
import pytest import pytest
from auto_archiver.core.metadata import Metadata from auto_archiver.core.metadata import Metadata
from auto_archiver.core.module import get_module, _LAZY_LOADED_MODULES from auto_archiver.core.module import ModuleFactory
# Test names inserted into this list will be run last. This is useful for expensive/costly tests # Test names inserted into this list will be run last. This is useful for expensive/costly tests
# that you only want to run if everything else succeeds (e.g. API calls). The order here is important # that you only want to run if everything else succeeds (e.g. API calls). The order here is important
@ -22,19 +22,19 @@ TESTS_TO_RUN_LAST = ['test_twitter_api_archiver']
def setup_module(request): def setup_module(request):
def _setup_module(module_name, config={}): def _setup_module(module_name, config={}):
module_factory = ModuleFactory()
if isinstance(module_name, type): if isinstance(module_name, type):
# get the module name: # get the module name:
# if the class does not have a .name, use the name of the parent folder # if the class does not have a .name, use the name of the parent folder
module_name = module_name.__module__.rsplit(".",2)[-2] module_name = module_name.__module__.rsplit(".",2)[-2]
m = get_module(module_name, {module_name: config}) m = module_factory.get_module(module_name, {module_name: config})
# add the tmp_dir to the module # add the tmp_dir to the module
tmp_dir = TemporaryDirectory() tmp_dir = TemporaryDirectory()
m.tmp_dir = tmp_dir.name m.tmp_dir = tmp_dir.name
def cleanup(): def cleanup():
_LAZY_LOADED_MODULES.pop(module_name)
tmp_dir.cleanup() tmp_dir.cleanup()
request.addfinalizer(cleanup) request.addfinalizer(cleanup)

Wyświetl plik

@ -2,7 +2,7 @@ import pytest
from auto_archiver.modules.hash_enricher import HashEnricher from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.core import Metadata, Media from auto_archiver.core import Metadata, Media
from auto_archiver.core.module import get_module_lazy from auto_archiver.core.module import ModuleFactory
@pytest.mark.parametrize("algorithm, filename, expected_hash", [ @pytest.mark.parametrize("algorithm, filename, expected_hash", [
("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"), ("SHA-256", "tests/data/testfile_1.txt", "1b4f0e9851971998e732078544c96b36c3d01cedf7caa332359d6f1d83567014"),
@ -22,7 +22,7 @@ def test_default_config_values(setup_module):
def test_config(): def test_config():
# test default config # test default config
c = get_module_lazy('hash_enricher').configs c = ModuleFactory().get_module_lazy('hash_enricher').configs
assert c["algorithm"]["default"] == "SHA-256" assert c["algorithm"]["default"] == "SHA-256"
assert c["chunksize"]["default"] == 16000000 assert c["chunksize"]["default"] == 16000000
assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"] assert c["algorithm"]["choices"] == ["SHA-256", "SHA3-512"]

Wyświetl plik

@ -1,24 +1,18 @@
import sys import sys
import pytest import pytest
from auto_archiver.core.module import get_module_lazy, BaseModule, LazyBaseModule, _LAZY_LOADED_MODULES from auto_archiver.core.module import ModuleFactory, LazyBaseModule
from auto_archiver.core.base_module import BaseModule
@pytest.fixture @pytest.fixture
def example_module(): def example_module():
import auto_archiver import auto_archiver
module_factory = ModuleFactory()
previous_path = auto_archiver.modules.__path__ previous_path = auto_archiver.modules.__path__
auto_archiver.modules.__path__.append("tests/data/test_modules/") auto_archiver.modules.__path__.append("tests/data/test_modules/")
module = get_module_lazy("example_module") return module_factory.get_module_lazy("example_module")
yield module
# cleanup
try:
del module._manifest
except AttributeError:
pass
del _LAZY_LOADED_MODULES["example_module"]
sys.modules.pop("auto_archiver.modules.example_module.example_module", None)
auto_archiver.modules.__path__ = previous_path
def test_get_module_lazy(example_module): def test_get_module_lazy(example_module):
assert example_module.name == "example_module" assert example_module.name == "example_module"
@ -46,12 +40,14 @@ def test_module_dependency_check_loads_module(example_module):
# monkey patch the manifest to include a nonexistent dependency # monkey patch the manifest to include a nonexistent dependency
example_module.manifest["dependencies"]["python"] = ["hash_enricher"] example_module.manifest["dependencies"]["python"] = ["hash_enricher"]
module_factory = example_module.module_factory
loaded_module = example_module.load({}) loaded_module = example_module.load({})
assert loaded_module is not None assert loaded_module is not None
# check the dependency is loaded # check the dependency is loaded
assert _LAZY_LOADED_MODULES["hash_enricher"] is not None assert module_factory._lazy_modules["hash_enricher"] is not None
assert _LAZY_LOADED_MODULES["hash_enricher"]._instance is not None assert module_factory._lazy_modules["hash_enricher"]._instance is not None
def test_load_module(example_module): def test_load_module(example_module):
@ -69,7 +65,7 @@ def test_load_module(example_module):
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) @pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_load_modules(module_name): def test_load_modules(module_name):
# test that specific modules can be loaded # test that specific modules can be loaded
module = get_module_lazy(module_name) module = ModuleFactory().get_module_lazy(module_name)
assert module is not None assert module is not None
assert isinstance(module, LazyBaseModule) assert isinstance(module, LazyBaseModule)
assert module.name == module_name assert module.name == module_name
@ -86,7 +82,7 @@ def test_load_modules(module_name):
@pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"]) @pytest.mark.parametrize("module_name", ["local_storage", "generic_extractor", "html_formatter", "csv_db"])
def test_lazy_base_module(module_name): def test_lazy_base_module(module_name):
lazy_module = get_module_lazy(module_name) lazy_module = ModuleFactory().get_module_lazy(module_name)
assert lazy_module is not None assert lazy_module is not None
assert isinstance(lazy_module, LazyBaseModule) assert isinstance(lazy_module, LazyBaseModule)

Wyświetl plik

@ -4,7 +4,7 @@ from argparse import ArgumentParser, ArgumentTypeError
from auto_archiver.core.orchestrator import ArchivingOrchestrator from auto_archiver.core.orchestrator import ArchivingOrchestrator
from auto_archiver.version import __version__ from auto_archiver.version import __version__
from auto_archiver.core.config import read_yaml, store_yaml from auto_archiver.core.config import read_yaml, store_yaml
from auto_archiver.core.module import _LAZY_LOADED_MODULES
TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml" TEST_ORCHESTRATION = "tests/data/test_orchestration.yaml"
TEST_MODULES = "tests/data/test_modules/" TEST_MODULES = "tests/data/test_modules/"
@ -17,22 +17,7 @@ def test_args():
@pytest.fixture @pytest.fixture
def orchestrator(): def orchestrator():
yield ArchivingOrchestrator() return ArchivingOrchestrator()
# hack - the loguru logger starts with one logger, but if orchestrator has run before
# it'll remove the default logger, add it back in:
from loguru import logger
if not logger._core.handlers.get(0):
logger._core.handlers_count = 0
logger.add(sys.stderr)
# and remove the custom logger
if logger._core.handlers.get(1):
logger.remove(1)
# delete out any loaded modules
_LAZY_LOADED_MODULES.clear()
@pytest.fixture @pytest.fixture
def basic_parser(orchestrator) -> ArgumentParser: def basic_parser(orchestrator) -> ArgumentParser: