Further fixes/changes to loading 'types' for config + manifest edits

pull/224/head
Patrick Robertson 2025-01-27 11:48:04 +01:00
rodzic 14e2479599
commit 7fd95866a1
29 zmienionych plików z 39 dodań i 143 usunięć

Wyświetl plik

@ -10,13 +10,6 @@ from auto_archiver.core import Metadata, Step
class Database(Step, ABC): class Database(Step, ABC):
name = "database" name = "database"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def init(name: str, config: dict) -> Database:
# only for typing...
return Step.init(name, config, Database)
def started(self, item: Metadata) -> None: def started(self, item: Metadata) -> None:
"""signals the DB that the given item archival has started""" """signals the DB that the given item archival has started"""

Wyświetl plik

@ -18,14 +18,5 @@ class Enricher(Step, ABC):
"""Base classes and utilities for enrichers in the Auto-Archiver system.""" """Base classes and utilities for enrichers in the Auto-Archiver system."""
name = "enricher" name = "enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
# only for typing...
def init(name: str, config: dict) -> Enricher:
return Step.init(name, config, Enricher)
@abstractmethod @abstractmethod
def enrich(self, to_enrich: Metadata) -> None: pass def enrich(self, to_enrich: Metadata) -> None: pass

Wyświetl plik

@ -9,13 +9,5 @@ from auto_archiver.core import Step
class Feeder(Step): class Feeder(Step):
name = "feeder" name = "feeder"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def init(name: str, config: dict) -> Feeder:
# only for code typing
return Step.init(name, config, Feeder)
@abstractmethod @abstractmethod
def __iter__(self) -> Metadata: return None def __iter__(self) -> Metadata: return None

Wyświetl plik

@ -6,16 +6,12 @@ flexible setup in various environments.
""" """
import argparse import argparse
from ruamel.yaml import YAML, CommentedMap from ruamel.yaml import YAML, CommentedMap, add_representer
from ruamel.yaml.comments import CommentedMap
from dataclasses import dataclass, field
from collections import OrderedDict
from collections.abc import Iterable
from copy import deepcopy from copy import deepcopy
from .loader import MODULE_TYPES from .loader import MODULE_TYPES
from typing import Any, List from typing import Any, List, Type
# configurable_parents = [ # configurable_parents = [
# Feeder, # Feeder,

Wyświetl plik

@ -152,12 +152,12 @@ class ArchivingOrchestrator:
if not modules: if not modules:
modules = available_modules(with_manifest=True) modules = available_modules(with_manifest=True)
module: Module
for module in modules: for module in modules:
if not module.configs: if not module.configs:
# this module has no configs, don't show anything in the help # this module has no configs, don't show anything in the help
# (TODO: do we want to show something about this module though, like a description?) # (TODO: do we want to show something about this module though, like a description?)
continue continue
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...") group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
for name, kwargs in module.configs.items(): for name, kwargs in module.configs.items():
# TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
@ -165,8 +165,8 @@ class ArchivingOrchestrator:
kwargs.pop('cli_set', None) kwargs.pop('cli_set', None)
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}" kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
try: try:
kwargs['type'] = getattr(__builtins__, kwargs.get('type', 'str')) kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
except AttributeError: except KeyError:
kwargs['type'] = getattr(validators, kwargs['type']) kwargs['type'] = getattr(validators, kwargs['type'])
group.add_argument(f"--{module.name}.{name}", **kwargs) group.add_argument(f"--{module.name}.{name}", **kwargs)
@ -207,7 +207,7 @@ class ArchivingOrchestrator:
exit() exit()
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1: if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
logger.error(f"Only one feeder is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}") logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
exit() exit()
for i, module in enumerate(modules_to_load): for i, module in enumerate(modules_to_load):

Wyświetl plik

@ -5,10 +5,12 @@
"external_dependencies": { "external_dependencies": {
"python": ["loguru"], "python": ["loguru"],
}, },
'entry_point': 'cli_feeder::CLIFeeder',
"configs": { "configs": {
"urls": { "urls": {
"default": None, "default": None,
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml", "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
"nargs": "+",
}, },
}, },
"description": """ "description": """

Wyświetl plik

@ -7,12 +7,6 @@ from auto_archiver.core import Metadata, ArchivingContext
class CLIFeeder(Feeder): class CLIFeeder(Feeder):
name = "cli_feeder" name = "cli_feeder"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
if type(self.urls) != list or len(self.urls) == 0:
raise Exception("CLI Feeder did not receive any URL to process")
def __iter__(self) -> Metadata: def __iter__(self) -> Metadata:
for url in self.urls: for url in self.urls:
logger.debug(f"Processing {url}") logger.debug(f"Processing {url}")

Wyświetl plik

@ -8,11 +8,6 @@ class ConsoleDb(Database):
""" """
Outputs results to the console Outputs results to the console
""" """
name = "console_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def started(self, item: Metadata) -> None: def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}") logger.warning(f"STARTED {item}")

Wyświetl plik

@ -4,6 +4,7 @@
"requires_setup": False, "requires_setup": False,
"external_dependencies": {"python": ["loguru"] "external_dependencies": {"python": ["loguru"]
}, },
'entry_point': 'csv_db::CSVDb',
"configs": { "configs": {
"csv_file": {"default": "db.csv", "help": "CSV file name"} "csv_file": {"default": "db.csv", "help": "CSV file name"}
}, },

Wyświetl plik

@ -11,13 +11,6 @@ class CSVDb(Database):
""" """
Outputs results to a CSV file Outputs results to a CSV file
""" """
name = "csv_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.assert_valid_string("csv_file")
def done(self, item: Metadata, cached: bool=False) -> None: def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB""" """archival result ready - should be saved to DB"""

Wyświetl plik

@ -6,6 +6,8 @@
"python": ["loguru"], "python": ["loguru"],
"bin": [""] "bin": [""]
}, },
'requires_setup': True,
'entry_point': "csv_feeder::CSVFeeder",
"configs": { "configs": {
"files": { "files": {
"default": None, "default": None,

Wyświetl plik

@ -2,7 +2,7 @@
'name': 'Generic Extractor', 'name': 'Generic Extractor',
'version': '0.1.0', 'version': '0.1.0',
'author': 'Bellingcat', 'author': 'Bellingcat',
'type': ['extractor', 'feeder', 'enricher'], 'type': ['extractor'],
'requires_setup': False, 'requires_setup': False,
'dependencies': { 'dependencies': {
'python': ['yt_dlp', 'requests', 'loguru', 'slugify'], 'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],

Wyświetl plik

@ -1,7 +1,7 @@
{ {
"name": "Google Sheets Feeder", "name": "Google Sheets Feeder",
"type": ["feeder"], "type": ["feeder"],
"entry_point": "GsheetsFeeder", "entry_point": "gsheet_feeder::GsheetsFeeder",
"requires_setup": True, "requires_setup": True,
"external_dependencies": { "external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"], "python": ["loguru", "gspread", "python-slugify"],

Wyświetl plik

@ -8,7 +8,10 @@
"configs": { "configs": {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]}, "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
# TODO add non-negative requirement to match previous implementation? # TODO add non-negative requirement to match previous implementation?
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"}, "chunksize": {"default": 1.6e7,
"help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
'type': 'positive_number',
},
}, },
"description": """ "description": """
Generates cryptographic hashes for media files to ensure data integrity and authenticity. Generates cryptographic hashes for media files to ensure data integrity and authenticity.

Wyświetl plik

@ -18,36 +18,6 @@ class HashEnricher(Enricher):
""" """
Calculates hashes for Media instances Calculates hashes for Media instances
""" """
name = "hash_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
algos = self.configs()["algorithm"]
algo_choices = algos["choices"]
if not getattr(self, 'algorithm', None):
if not config.get('algorithm'):
logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
self.algorithm = algos["default"]
else:
self.algorithm = config["algorithm"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
if not getattr(self, 'chunksize', None):
if config.get('chunksize'):
self.chunksize = config["chunksize"]
else:
self.chunksize = self.configs()["chunksize"]["default"]
try:
self.chunksize = int(self.chunksize)
except ValueError:
raise ValueError(f"Invalid chunksize value: {self.chunksize}. Must be an integer.")
assert self.chunksize >= -1, "read length must be non-negative or -1"
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() url = to_enrich.get_url()

Wyświetl plik

@ -16,17 +16,17 @@ from auto_archiver.utils.misc import random_str
@dataclass @dataclass
class HtmlFormatter(Formatter): class HtmlFormatter(Formatter):
name = "html_formatter"
def __init__(self, config: dict) -> None: # TODO: fix setting up template with new config method
# without this STEP.__init__ is not called # def __init__(self, config: dict) -> None:
super().__init__(config) # # without this STEP.__init__ is not called
self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True) # super().__init__(config)
# JinjaHelper class static methods are added as filters # self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
self.environment.filters.update({ # # JinjaHelper class static methods are added as filters
k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod) # self.environment.filters.update({
}) # k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
self.template = self.environment.get_template("html_template.html") # })
# self.template = self.environment.get_template("html_template.html")
def format(self, item: Metadata) -> Media: def format(self, item: Metadata) -> Media:
url = item.get_url() url = item.get_url()

Wyświetl plik

@ -1 +1 @@
from .local import LocalStorage from .local_storage import LocalStorage

Wyświetl plik

@ -11,9 +11,10 @@ from auto_archiver.base_processors import Storage
class LocalStorage(Storage): class LocalStorage(Storage):
name = "local_storage" name = "local_storage"
def __init__(self, config: dict) -> None: def __init__(self) -> None:
super().__init__(config) super().__init__()
os.makedirs(self.save_to, exist_ok=True) # TODO: fix up passing config values to 'steps'
# os.makedirs(self.save_to, exist_ok=True)
def get_cdn_url(self, media: Media) -> str: def get_cdn_url(self, media: Media) -> str:
# TODO: is this viable with Storage.configs on path/filename? # TODO: is this viable with Storage.configs on path/filename?

Wyświetl plik

@ -10,12 +10,6 @@ class MetaEnricher(Enricher):
""" """
Adds metadata information about the archive operations, to be included at the end of all enrichments Adds metadata information about the archive operations, to be included at the end of all enrichments
""" """
name = "meta_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() url = to_enrich.get_url()

Wyświetl plik

@ -10,11 +10,6 @@ class MetadataEnricher(Enricher):
""" """
Extracts metadata information from files using exiftool. Extracts metadata information from files using exiftool.
""" """
name = "metadata_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:

Wyświetl plik

@ -1,7 +1,7 @@
m = { {
"name": "Mute Formatter", "name": "Mute Formatter",
"type": ["formatter"], "type": ["formatter"],
"requires_setup": False, "requires_setup": True,
"external_dependencies": { "external_dependencies": {
}, },
"description": """ Default formatter. "description": """ Default formatter.

Wyświetl plik

@ -1,16 +1,12 @@
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
from ..core import Metadata, Media from auto_archiver.core import Metadata, Media
from . import Formatter from auto_archiver.base_processors import Formatter
@dataclass @dataclass
class MuteFormatter(Formatter): class MuteFormatter(Formatter):
name = "mute_formatter" name = "mute_formatter"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def format(self, item: Metadata) -> Media: return None def format(self, item: Metadata) -> Media: return None

Wyświetl plik

@ -25,11 +25,6 @@ class PdqHashEnricher(Enricher):
Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection. Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection.
Ideally this enrichment is orchestrated to run after the thumbnail_enricher. Ideally this enrichment is orchestrated to run after the thumbnail_enricher.
""" """
name = "pdq_hash_enricher"
def __init__(self, config: dict) -> None:
# Without this STEP.__init__ is not called
super().__init__(config)
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url() url = to_enrich.get_url()

Wyświetl plik

@ -20,7 +20,7 @@
"region": {"default": None, "help": "S3 region name"}, "region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"}, "key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"}, "secret": {"default": None, "help": "S3 API secret"},
"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"}, "random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
"endpoint_url": { "endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com', "default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime" "help": "S3 bucket endpoint, {region} are inserted at runtime"

Wyświetl plik

@ -5,6 +5,7 @@
"external_dependencies": { "external_dependencies": {
"python": ["loguru", "python-slugify"], "python": ["loguru", "python-slugify"],
}, },
'entry_point': 'ssl_enricher::SSLEnricher',
"configs": { "configs": {
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"}, "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
}, },

Wyświetl plik

@ -11,11 +11,6 @@ class SSLEnricher(Enricher):
""" """
Retrieves SSL certificate information for a domain, as a file Retrieves SSL certificate information for a domain, as a file
""" """
name = "ssl_enricher"
def __init__(self, config: dict) -> None:
super().__init__(config)
self.skip_when_nothing_archived = bool(self.skip_when_nothing_archived)
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
if not to_enrich.media and self.skip_when_nothing_archived: return if not to_enrich.media and self.skip_when_nothing_archived: return

Wyświetl plik

@ -11,11 +11,6 @@ class TelegramExtractor(Extractor):
Extractor for telegram that does not require login, but the telethon_extractor is much more advised, Extractor for telegram that does not require login, but the telethon_extractor is much more advised,
will only return if at least one image or one video is found will only return if at least one image or one video is found
""" """
name = "telegram_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)
def download(self, item: Metadata) -> Metadata: def download(self, item: Metadata) -> Metadata:
url = item.get_url() url = item.get_url()

Wyświetl plik

@ -1,4 +1,3 @@
import json
{ {
"name": "telethon_extractor", "name": "telethon_extractor",
"type": ["extractor"], "type": ["extractor"],

Wyświetl plik

@ -18,13 +18,6 @@ class ThumbnailEnricher(Enricher):
""" """
Generates thumbnails for all the media Generates thumbnails for all the media
""" """
name = "thumbnail_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.thumbnails_per_second = int(self.thumbnails_per_minute) / 60
self.max_thumbnails = int(self.max_thumbnails)
def enrich(self, to_enrich: Metadata) -> None: def enrich(self, to_enrich: Metadata) -> None:
""" """