kopia lustrzana https://github.com/bellingcat/auto-archiver
Refactor loader + step into module, use LazyBaseModule and BaseModule
rodzic
7fd95866a1
commit
f68e2726f2
|
@ -3,13 +3,11 @@ from dataclasses import dataclass
|
||||||
from abc import abstractmethod, ABC
|
from abc import abstractmethod, ABC
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from auto_archiver.core import Metadata, Step
|
from auto_archiver.core import Metadata, BaseModule
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Database(Step, ABC):
|
class Database(BaseModule):
|
||||||
|
|
||||||
name = "database"
|
|
||||||
|
|
||||||
def started(self, item: Metadata) -> None:
|
def started(self, item: Metadata) -> None:
|
||||||
"""signals the DB that the given item archival has started"""
|
"""signals the DB that the given item archival has started"""
|
||||||
|
|
|
@ -11,12 +11,11 @@ Enrichers are optional but highly useful for making the archived data more power
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from abc import abstractmethod, ABC
|
from abc import abstractmethod, ABC
|
||||||
from auto_archiver.core import Metadata, Step
|
from auto_archiver.core import Metadata, BaseModule
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Enricher(Step, ABC):
|
class Enricher(BaseModule):
|
||||||
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
|
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
|
||||||
name = "enricher"
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def enrich(self, to_enrich: Metadata) -> None: pass
|
def enrich(self, to_enrich: Metadata) -> None: pass
|
||||||
|
|
|
@ -25,7 +25,7 @@ class Extractor:
|
||||||
Subclasses must implement the `download` method to define platform-specific behavior.
|
Subclasses must implement the `download` method to define platform-specific behavior.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def setup(self) -> None:
|
def setup(self, *args, **kwargs) -> None:
|
||||||
# used when extractors need to login or do other one-time setup
|
# used when extractors need to login or do other one-time setup
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
@ -2,12 +2,11 @@ from __future__ import annotations
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from auto_archiver.core import Metadata
|
from auto_archiver.core import Metadata
|
||||||
from auto_archiver.core import Step
|
from auto_archiver.core import BaseModule
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Feeder(Step):
|
class Feeder(BaseModule):
|
||||||
name = "feeder"
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def __iter__(self) -> Metadata: return None
|
def __iter__(self) -> Metadata: return None
|
|
@ -1,20 +1,11 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from auto_archiver.core import Metadata, Media, Step
|
from auto_archiver.core import Metadata, Media, BaseModule
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Formatter(Step):
|
class Formatter(BaseModule):
|
||||||
name = "formatter"
|
|
||||||
|
|
||||||
def __init__(self, config: dict) -> None:
|
|
||||||
# without this STEP.__init__ is not called
|
|
||||||
super().__init__(config)
|
|
||||||
|
|
||||||
def init(name: str, config: dict) -> Formatter:
|
|
||||||
# only for code typing
|
|
||||||
return Step.init(name, config, Formatter)
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def format(self, item: Metadata) -> Media: return None
|
def format(self, item: Metadata) -> Media: return None
|
|
@ -6,19 +6,14 @@ import os
|
||||||
|
|
||||||
from auto_archiver.utils.misc import random_str
|
from auto_archiver.utils.misc import random_str
|
||||||
|
|
||||||
from auto_archiver.core import Media, Step, ArchivingContext, Metadata
|
from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata
|
||||||
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from slugify import slugify
|
from slugify import slugify
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Storage(Step):
|
class Storage(BaseModule):
|
||||||
name = "storage"
|
|
||||||
|
|
||||||
def init(name: str, config: dict) -> Storage:
|
|
||||||
# only for typing...
|
|
||||||
return Step.init(name, config, Storage)
|
|
||||||
|
|
||||||
def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
|
def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
|
||||||
if media.is_stored():
|
if media.is_stored():
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
"""
|
"""
|
||||||
from .metadata import Metadata
|
from .metadata import Metadata
|
||||||
from .media import Media
|
from .media import Media
|
||||||
from .step import Step
|
from .module import BaseModule
|
||||||
from .context import ArchivingContext
|
from .context import ArchivingContext
|
||||||
|
|
||||||
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
# cannot import ArchivingOrchestrator/Config to avoid circular dep
|
||||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
||||||
from ruamel.yaml import YAML, CommentedMap, add_representer
|
from ruamel.yaml import YAML, CommentedMap, add_representer
|
||||||
|
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from .loader import MODULE_TYPES
|
from .module import MODULE_TYPES
|
||||||
|
|
||||||
from typing import Any, List, Type
|
from typing import Any, List, Type
|
||||||
|
|
||||||
|
|
|
@ -1,173 +0,0 @@
|
||||||
import ast
|
|
||||||
from typing import Type
|
|
||||||
from importlib.util import find_spec
|
|
||||||
from dataclasses import dataclass
|
|
||||||
import os
|
|
||||||
import copy
|
|
||||||
from os.path import join, dirname
|
|
||||||
from typing import List
|
|
||||||
from loguru import logger
|
|
||||||
import sys
|
|
||||||
import shutil
|
|
||||||
|
|
||||||
_LOADED_MODULES = {}
|
|
||||||
|
|
||||||
MODULE_TYPES = [
|
|
||||||
'feeder',
|
|
||||||
'enricher',
|
|
||||||
'extractor',
|
|
||||||
'database',
|
|
||||||
'storage',
|
|
||||||
'formatter'
|
|
||||||
]
|
|
||||||
|
|
||||||
MANIFEST_FILE = "__manifest__.py"
|
|
||||||
_DEFAULT_MANIFEST = {
|
|
||||||
'name': '',
|
|
||||||
'author': 'Bellingcat',
|
|
||||||
'type': [],
|
|
||||||
'requires_setup': True,
|
|
||||||
'description': '',
|
|
||||||
'dependencies': {},
|
|
||||||
'entry_point': '',
|
|
||||||
'version': '1.0',
|
|
||||||
'configs': {}
|
|
||||||
}
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class Module:
|
|
||||||
name: str
|
|
||||||
display_name: str
|
|
||||||
type: list
|
|
||||||
dependencies: dict
|
|
||||||
requires_setup: bool
|
|
||||||
configs: dict
|
|
||||||
description: str
|
|
||||||
path: str
|
|
||||||
manifest: dict
|
|
||||||
|
|
||||||
def __init__(self, module_name, path, manifest):
|
|
||||||
self.name = module_name
|
|
||||||
self.path = path
|
|
||||||
self.manifest = manifest
|
|
||||||
if manifest:
|
|
||||||
self.display_name = manifest['name']
|
|
||||||
self.type = manifest['type']
|
|
||||||
self._entry_point = manifest['entry_point']
|
|
||||||
self.dependencies = manifest['dependencies']
|
|
||||||
self.requires_setup = manifest['requires_setup']
|
|
||||||
self.configs = manifest['configs']
|
|
||||||
self.description = manifest['description']
|
|
||||||
|
|
||||||
@property
|
|
||||||
def entry_point(self):
|
|
||||||
if not self._entry_point:
|
|
||||||
# try to create the entry point from the module name
|
|
||||||
self._entry_point = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}"
|
|
||||||
return self._entry_point
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return f"Module<'{self.display_name}' ({self.name})>"
|
|
||||||
|
|
||||||
def load_module(module: str) -> object: # TODO: change return type to Step
|
|
||||||
|
|
||||||
if module in _LOADED_MODULES:
|
|
||||||
return _LOADED_MODULES[module]
|
|
||||||
|
|
||||||
# load a module by name
|
|
||||||
module = get_module(module)
|
|
||||||
if not module:
|
|
||||||
return None
|
|
||||||
# check external dependencies are installed
|
|
||||||
def check_deps(deps, check):
|
|
||||||
for dep in deps:
|
|
||||||
if not check(dep):
|
|
||||||
logger.error(f"Module '{module.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{module.name}' module? See the README for more information.")
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
check_deps(module.dependencies.get('python', []), lambda dep: find_spec(dep))
|
|
||||||
check_deps(module.dependencies.get('bin', []), lambda dep: shutil.which(dep))
|
|
||||||
|
|
||||||
qualname = f'auto_archiver.modules.{module.name}'
|
|
||||||
|
|
||||||
logger.info(f"Loading module '{module.display_name}'...")
|
|
||||||
# first import the whole module, to make sure it's working properly
|
|
||||||
__import__(qualname)
|
|
||||||
|
|
||||||
|
|
||||||
# then import the file for the entry point
|
|
||||||
file_name, class_name = module.entry_point.split('::')
|
|
||||||
sub_qualname = f'{qualname}.{file_name}'
|
|
||||||
|
|
||||||
__import__(f'{qualname}.{file_name}', fromlist=[module.entry_point])
|
|
||||||
|
|
||||||
# finally, get the class instance
|
|
||||||
instance = getattr(sys.modules[sub_qualname], class_name)()
|
|
||||||
if not getattr(instance, 'name', None):
|
|
||||||
instance.name = module.name
|
|
||||||
|
|
||||||
_LOADED_MODULES[module.name] = instance
|
|
||||||
return _LOADED_MODULES[module.name]
|
|
||||||
|
|
||||||
|
|
||||||
# finally, load the module
|
|
||||||
|
|
||||||
def load_manifest(module_path):
|
|
||||||
# print(f"Loading manifest for module {module_path}")
|
|
||||||
# load the manifest file
|
|
||||||
manifest = copy.deepcopy(_DEFAULT_MANIFEST)
|
|
||||||
|
|
||||||
with open(join(module_path, MANIFEST_FILE)) as f:
|
|
||||||
try:
|
|
||||||
manifest.update(ast.literal_eval(f.read()))
|
|
||||||
except ( ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
|
|
||||||
logger.error(f"Error loading manifest from file {module_path}/{MANIFEST_FILE}: {e}")
|
|
||||||
return manifest
|
|
||||||
return manifest
|
|
||||||
|
|
||||||
def get_module(module_name):
|
|
||||||
# get a module by name
|
|
||||||
try:
|
|
||||||
return available_modules(limit_to_modules=[module_name], with_manifest=True)[0]
|
|
||||||
except IndexError:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], additional_paths: List[str] = [], suppress_warnings: bool = False) -> List[Module]:
|
|
||||||
# search through all valid 'modules' paths. Default is 'modules' in the current directory
|
|
||||||
|
|
||||||
# see odoo/modules/module.py -> get_modules
|
|
||||||
def is_really_module(name):
|
|
||||||
if os.path.isfile(join(name, MANIFEST_FILE)):
|
|
||||||
return True
|
|
||||||
|
|
||||||
default_path = [join(dirname(dirname((__file__))), "modules")]
|
|
||||||
all_modules = []
|
|
||||||
|
|
||||||
for module_folder in default_path + additional_paths:
|
|
||||||
# walk through each module in module_folder and check if it has a valid manifest
|
|
||||||
try:
|
|
||||||
possible_modules = os.listdir(module_folder)
|
|
||||||
except FileNotFoundError:
|
|
||||||
logger.warning(f"Module folder {module_folder} does not exist")
|
|
||||||
continue
|
|
||||||
|
|
||||||
for possible_module in possible_modules:
|
|
||||||
if limit_to_modules and possible_module not in limit_to_modules:
|
|
||||||
continue
|
|
||||||
|
|
||||||
possible_module_path = join(module_folder, possible_module)
|
|
||||||
if not is_really_module(possible_module_path):
|
|
||||||
continue
|
|
||||||
# parse manifest and add to list of available modules
|
|
||||||
if with_manifest:
|
|
||||||
manifest = load_manifest(possible_module_path)
|
|
||||||
else:
|
|
||||||
manifest = {}
|
|
||||||
all_modules.append(Module(possible_module, possible_module_path, manifest))
|
|
||||||
|
|
||||||
if not suppress_warnings:
|
|
||||||
for module in limit_to_modules:
|
|
||||||
if not any(module == m.name for m in all_modules):
|
|
||||||
logger.warning(f"Module '{module}' not found. Are you sure it's installed?")
|
|
||||||
|
|
||||||
return all_modules
|
|
|
@ -0,0 +1,196 @@
|
||||||
|
"""
|
||||||
|
Defines the BaseModule abstract base class, which acts as a blueprint for modules (steps) in the archiving pipeline
|
||||||
|
by applying user configuration, plus LazyBaseModule and the helpers that discover modules through their manifests and instantiate them on demand.
|
||||||
|
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List
|
||||||
|
from abc import ABC
|
||||||
|
import shutil
|
||||||
|
import ast
|
||||||
|
import copy
|
||||||
|
import sys
|
||||||
|
from importlib.util import find_spec
|
||||||
|
import os
|
||||||
|
from os.path import join, dirname
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
|
_LAZY_LOADED_MODULES = {}
|
||||||
|
|
||||||
|
MODULE_TYPES = [
|
||||||
|
'feeder',
|
||||||
|
'extractor',
|
||||||
|
'enricher',
|
||||||
|
'database',
|
||||||
|
'storage',
|
||||||
|
'formatter'
|
||||||
|
]
|
||||||
|
|
||||||
|
MANIFEST_FILE = "__manifest__.py"
|
||||||
|
_DEFAULT_MANIFEST = {
|
||||||
|
'name': '',
|
||||||
|
'author': 'Bellingcat',
|
||||||
|
'type': [],
|
||||||
|
'requires_setup': True,
|
||||||
|
'description': '',
|
||||||
|
'dependencies': {},
|
||||||
|
'entry_point': '',
|
||||||
|
'version': '1.0',
|
||||||
|
'configs': {}
|
||||||
|
}
|
||||||
|
|
||||||
|
class BaseModule(ABC):
    """Common base class for the pipeline modules.

    Attributes:
        config: the complete configuration dict passed to ``setup``.
        name: the module's name; it is also the key of this module's own
            section inside ``config``.
    """

    config: dict
    name: str

    def setup(self, config: dict):
        """Store ``config`` and expose this module's own settings as attributes.

        Every key/value pair found under ``config[self.name]`` is copied onto
        the instance with ``setattr``, so a setting ``foo`` becomes
        ``self.foo``.

        Args:
            config: the full configuration, keyed by module name.
        """
        self.config = config
        # `or {}` covers both a missing section and a section that is present
        # but empty (i.e. None), which would otherwise fail on `.items()`.
        own_settings = config.get(self.name) or {}
        for key, val in own_settings.items():
            setattr(self, key, val)
|
||||||
|
|
||||||
|
def get_module(module_name: str, additional_paths: List[str] = None):
    """Return the lazy module handle for ``module_name``, caching the result.

    The first lookup searches the module folders through
    ``available_modules``; later lookups for the same name are served from
    ``_LAZY_LOADED_MODULES``.

    Args:
        module_name: name of the module (its folder name).
        additional_paths: extra folders to search besides the default one.

    Returns:
        The first matching module returned by ``available_modules``.

    Raises:
        IndexError: if no module with that name is found.
    """
    if module_name in _LAZY_LOADED_MODULES:
        return _LAZY_LOADED_MODULES[module_name]

    # A fresh list per call instead of a shared mutable default argument.
    matches = available_modules(additional_paths=additional_paths or [], limit_to_modules=[module_name])
    if not matches:
        # Same exception type as the bare `[0]` lookup used to raise, but with
        # a message that says which module is missing.
        raise IndexError(f"Module '{module_name}' not found. Are you sure it's installed?")

    module = matches[0]
    _LAZY_LOADED_MODULES[module_name] = module
    return module
|
||||||
|
|
||||||
|
def available_modules(with_manifest: bool = False, limit_to_modules: List[str] = None, additional_paths: List[str] = None, suppress_warnings: bool = False) -> List[LazyBaseModule]:
    """List the modules found in the default and additional module folders.

    A sub-folder counts as a module when it contains a ``MANIFEST_FILE``.
    The default folder is the ``modules`` directory located one level above
    the directory that holds this file.

    Args:
        with_manifest: accepted for backward compatibility; it is not used
            here, nothing in this function reads a manifest.
        limit_to_modules: when non-empty, only folders with these names are
            considered.
        additional_paths: extra folders to search after the default one.
        suppress_warnings: when False, a warning is logged for every name in
            ``limit_to_modules`` that was not found.

    Returns:
        One ``LazyBaseModule`` per module folder found, in search order.
    """
    # Fresh lists per call instead of shared mutable default arguments.
    limit_to_modules = list(limit_to_modules or [])
    additional_paths = list(additional_paths or [])

    # see odoo/modules/module.py -> get_modules
    def is_really_module(module_path):
        return os.path.isfile(join(module_path, MANIFEST_FILE))

    default_path = [join(dirname(dirname(__file__)), "modules")]
    all_modules = []

    for module_folder in default_path + additional_paths:
        # walk through each entry in module_folder and check if it has a manifest
        try:
            possible_modules = os.listdir(module_folder)
        except FileNotFoundError:
            logger.warning(f"Module folder {module_folder} does not exist")
            continue
        except NotADirectoryError:
            # e.g. a file was passed as an additional module path
            logger.warning(f"Module folder {module_folder} is not a directory")
            continue

        for possible_module in possible_modules:
            if limit_to_modules and possible_module not in limit_to_modules:
                continue

            possible_module_path = join(module_folder, possible_module)
            if not is_really_module(possible_module_path):
                continue

            all_modules.append(LazyBaseModule(possible_module, possible_module_path))

    if not suppress_warnings:
        found_names = {m.name for m in all_modules}
        for module in limit_to_modules:
            if module not in found_names:
                logger.warning(f"Module '{module}' not found. Are you sure it's installed?")

    return all_modules
|
||||||
|
|
||||||
|
@dataclass
class LazyBaseModule:
    """Handle on a module folder that defers the expensive work.

    Creating the handle only records the module's name and path. The
    manifest file is parsed on first access to ``manifest`` (or to anything
    derived from it), and the module's class is imported and instantiated
    only when ``load`` is called.
    """

    name: str
    display_name: str
    type: list
    requires_setup: bool
    description: str
    path: str

    _manifest: dict = None
    _instance: "BaseModule" = None
    _entry_point: str = None

    # Attributes that are filled in from the manifest, mapped to the manifest
    # key they come from. Used by __getattr__ for lazy access.
    _MANIFEST_FIELDS = {
        'display_name': 'name',
        'type': 'type',
        'requires_setup': 'requires_setup',
        'description': 'description',
    }

    def __init__(self, module_name, path):
        """Record the module's folder name and path; nothing is read yet."""
        self.name = module_name
        self.path = path

    def __getattr__(self, attr):
        """Resolve manifest-backed attributes that have not been set yet.

        Only called when normal attribute lookup fails. Without it, reading
        e.g. ``display_name`` (as ``__repr__`` does) before the manifest was
        loaded raised AttributeError.
        """
        manifest_key = self._MANIFEST_FIELDS.get(attr)
        if manifest_key is None:
            raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{attr}'")
        return self.manifest[manifest_key]

    @property
    def entry_point(self):
        """The ``file_name::ClassName`` string used by ``load``.

        Taken from the manifest when it declares one; otherwise derived from
        the module name (``my_module`` -> ``my_module::MyModule``).
        """
        if not self._entry_point:
            derived = f"{self.name}::{self.name.replace('_', ' ').title().replace(' ', '')}"
            self._entry_point = self.manifest['entry_point'] or derived
        return self._entry_point

    @property
    def dependencies(self):
        """The ``dependencies`` entry of the manifest."""
        return self.manifest['dependencies']

    @property
    def configs(self):
        """The ``configs`` entry of the manifest."""
        return self.manifest['configs']

    @property
    def manifest(self):
        """The module's manifest merged over ``_DEFAULT_MANIFEST``, cached.

        The manifest file is read with ``ast.literal_eval``. If it cannot be
        evaluated, the error is logged and the defaults are used.
        """
        if self._manifest is not None:
            return self._manifest

        # start from the defaults so every expected key is always present
        manifest = copy.deepcopy(_DEFAULT_MANIFEST)

        with open(join(self.path, MANIFEST_FILE), encoding="utf-8") as f:
            try:
                manifest.update(ast.literal_eval(f.read()))
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
                logger.error(f"Error loading manifest from file {self.path}/{MANIFEST_FILE}: {e}")

        self._manifest = manifest
        self.display_name = manifest['name']
        self.type = manifest['type']
        self._entry_point = manifest['entry_point']
        self.requires_setup = manifest['requires_setup']
        self.description = manifest['description']

        return manifest

    def load(self):
        """Import the module's entry point and return a cached instance of it.

        Checks that the Python and binary dependencies declared in the
        manifest are available first; if one is missing, an error is logged
        and the process exits with status 1.
        """
        # `is not None` rather than truthiness: an instance that defines
        # __len__/__bool__ could be falsy and would be re-created every call.
        if self._instance is not None:
            return self._instance

        # check external dependencies are installed
        def check_deps(deps, check):
            for dep in deps:
                if not check(dep):
                    logger.error(f"Module '{self.name}' requires external dependency '{dep}' which is not available. Have you installed the required dependencies for the '{self.name}' module? See the README for more information.")
                    # sys.exit instead of the `exit` helper, which only exists
                    # when the `site` module has been loaded.
                    sys.exit(1)

        check_deps(self.dependencies.get('python', []), find_spec)
        check_deps(self.dependencies.get('bin', []), shutil.which)

        logger.debug(f"Loading module '{self.display_name}'...")

        # Try the module as a top-level package first, then as one of the
        # bundled modules. If neither imports, `qualname` keeps the last
        # candidate and the import below raises the ImportError.
        for qualname in [self.name, f'auto_archiver.modules.{self.name}']:
            try:
                # first import the whole module, to make sure it's working properly
                __import__(qualname)
                break
            except ImportError:
                pass

        # then import the file for the entry point
        file_name, class_name = self.entry_point.split('::')
        sub_qualname = f'{qualname}.{file_name}'

        __import__(sub_qualname, fromlist=[self.entry_point])

        # finally, get the class instance
        instance = getattr(sys.modules[sub_qualname], class_name)()
        if not getattr(instance, 'name', None):
            instance.name = self.name

        if not getattr(instance, 'display_name', None):
            instance.display_name = self.display_name

        self._instance = instance
        return instance

    def __repr__(self):
        return f"Module<'{self.display_name}' ({self.name})>"
|
|
@ -19,8 +19,9 @@ from .context import ArchivingContext
|
||||||
from .metadata import Metadata
|
from .metadata import Metadata
|
||||||
from ..version import __version__
|
from ..version import __version__
|
||||||
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG
|
from .config import read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG
|
||||||
from .loader import available_modules, Module, MODULE_TYPES, load_module
|
from .module import available_modules, LazyBaseModule, MODULE_TYPES, get_module
|
||||||
from . import validators
|
from . import validators
|
||||||
|
from .module import BaseModule
|
||||||
|
|
||||||
import tempfile, traceback
|
import tempfile, traceback
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
@ -107,7 +108,7 @@ class ArchivingOrchestrator:
|
||||||
else:
|
else:
|
||||||
# load all modules, they're not using the 'simple' mode
|
# load all modules, they're not using the 'simple' mode
|
||||||
self.add_module_args(available_modules(with_manifest=True), parser)
|
self.add_module_args(available_modules(with_manifest=True), parser)
|
||||||
|
|
||||||
parser.set_defaults(**to_dot_notation(yaml_config))
|
parser.set_defaults(**to_dot_notation(yaml_config))
|
||||||
|
|
||||||
# reload the parser with the new arguments, now that we have them
|
# reload the parser with the new arguments, now that we have them
|
||||||
|
@ -147,22 +148,27 @@ class ArchivingOrchestrator:
|
||||||
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
parser.add_argument('--logging.file', action='store', dest='logging.file', help='the logging file to write to', default=None)
|
||||||
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
parser.add_argument('--logging.rotation', action='store', dest='logging.rotation', help='the logging rotation to use', default=None)
|
||||||
|
|
||||||
def add_module_args(self, modules: list[Module] = None, parser: argparse.ArgumentParser = None):
|
# additional modules
|
||||||
|
parser.add_argument('--additional-modules', dest='additional_modules', nargs='+', help='additional paths to search for modules', action=UniqueAppendAction)
|
||||||
|
|
||||||
|
def add_module_args(self, modules: list[LazyBaseModule] = None, parser: argparse.ArgumentParser = None):
|
||||||
|
|
||||||
if not modules:
|
if not modules:
|
||||||
modules = available_modules(with_manifest=True)
|
modules = available_modules(with_manifest=True)
|
||||||
|
|
||||||
module: Module
|
module: LazyBaseModule
|
||||||
for module in modules:
|
for module in modules:
|
||||||
if not module.configs:
|
if not module.configs:
|
||||||
# this module has no configs, don't show anything in the help
|
# this module has no configs, don't show anything in the help
|
||||||
# (TODO: do we want to show something about this module though, like a description?)
|
# (TODO: do we want to show something about this module though, like a description?)
|
||||||
continue
|
continue
|
||||||
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
|
||||||
|
|
||||||
for name, kwargs in module.configs.items():
|
for name, kwargs in module.configs.items():
|
||||||
# TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
|
# TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
|
||||||
# in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
|
# in most cases it'll mean replacing it with 'type': 'str' or 'type': 'int' or something
|
||||||
kwargs.pop('cli_set', None)
|
kwargs.pop('cli_set', None)
|
||||||
|
|
||||||
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
|
||||||
try:
|
try:
|
||||||
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
|
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
|
||||||
|
@ -210,10 +216,11 @@ class ArchivingOrchestrator:
|
||||||
logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
for i, module in enumerate(modules_to_load):
|
for module in modules_to_load:
|
||||||
if module in invalid_modules:
|
if module in invalid_modules:
|
||||||
continue
|
continue
|
||||||
loaded_module = load_module(module)
|
loaded_module: BaseModule = get_module(module).load()
|
||||||
|
loaded_module.setup(self.config)
|
||||||
if not loaded_module:
|
if not loaded_module:
|
||||||
invalid_modules.append(module)
|
invalid_modules.append(module)
|
||||||
continue
|
continue
|
||||||
|
@ -238,6 +245,8 @@ class ArchivingOrchestrator:
|
||||||
if basic_config.help:
|
if basic_config.help:
|
||||||
self.show_help()
|
self.show_help()
|
||||||
|
|
||||||
|
logger.info(f"======== Welcome to the AUTO ARCHIVER ({__version__}) ==========")
|
||||||
|
|
||||||
# load the config file
|
# load the config file
|
||||||
yaml_config = {}
|
yaml_config = {}
|
||||||
|
|
||||||
|
@ -252,12 +261,9 @@ class ArchivingOrchestrator:
|
||||||
|
|
||||||
self.install_modules()
|
self.install_modules()
|
||||||
|
|
||||||
logger.info("FEEDERS: " + ", ".join(m.name for m in self.config['steps']['feeders']))
|
# log out the modules that were loaded
|
||||||
logger.info("EXTRACTORS: " + ", ".join(m.name for m in self.config['steps']['extractors']))
|
for module_type in MODULE_TYPES:
|
||||||
logger.info("ENRICHERS: " + ", ".join(m.name for m in self.config['steps']['enrichers']))
|
logger.info(f"{module_type.upper()}S: " + ", ".join(m.display_name for m in self.config['steps'][f"{module_type}s"]))
|
||||||
logger.info("DATABASES: " + ", ".join(m.name for m in self.config['steps']['databases']))
|
|
||||||
logger.info("STORAGES: " + ", ".join(m.name for m in self.config['steps']['storages']))
|
|
||||||
logger.info("FORMATTERS: " + ", ".join(m.name for m in self.config['steps']['formatters']))
|
|
||||||
|
|
||||||
for item in self.feed():
|
for item in self.feed():
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -1,11 +0,0 @@
|
||||||
"""
|
|
||||||
Defines the Step abstract base class, which acts as a blueprint for steps in the archiving pipeline
|
|
||||||
by handling user configuration, validating the steps properties, and implementing dynamic instantiation.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
class Step:
|
|
||||||
# Nothing to see here :)
|
|
||||||
pass
|
|
|
@ -3,3 +3,5 @@
|
||||||
def example_validator(value):
|
def example_validator(value):
|
||||||
return "example" in value
|
return "example" in value
|
||||||
|
|
||||||
|
def positive_number(value):
|
||||||
|
return value > 0
|
|
@ -8,9 +8,9 @@
|
||||||
'entry_point': 'cli_feeder::CLIFeeder',
|
'entry_point': 'cli_feeder::CLIFeeder',
|
||||||
"configs": {
|
"configs": {
|
||||||
"urls": {
|
"urls": {
|
||||||
"default": None,
|
|
||||||
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
|
||||||
"nargs": "+",
|
"nargs": "+",
|
||||||
|
"required": True,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
"description": """
|
"description": """
|
||||||
|
|
|
@ -5,11 +5,10 @@ from auto_archiver.core import Metadata, ArchivingContext
|
||||||
|
|
||||||
|
|
||||||
class CLIFeeder(Feeder):
|
class CLIFeeder(Feeder):
|
||||||
name = "cli_feeder"
|
|
||||||
|
|
||||||
def __iter__(self) -> Metadata:
|
def __iter__(self) -> Metadata:
|
||||||
for url in self.urls:
|
for url in self.urls:
|
||||||
logger.debug(f"Processing {url}")
|
logger.debug(f"Processing URL: '{url}'")
|
||||||
yield Metadata().set_url(url)
|
yield Metadata().set_url(url)
|
||||||
ArchivingContext.set("folder", "cli")
|
ArchivingContext.set("folder", "cli")
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
"name": "csv_db",
|
"name": "CSV Database",
|
||||||
"type": ["database"],
|
"type": ["database"],
|
||||||
"requires_setup": False,
|
"requires_setup": False,
|
||||||
"external_dependencies": {"python": ["loguru"]
|
"external_dependencies": {"python": ["loguru"]
|
||||||
|
|
Ładowanie…
Reference in New Issue