Further fixes/changes to loading 'types' for config + manifest edits

pull/224/head
Patrick Robertson 2025-01-27 11:48:04 +01:00
parent 14e2479599
commit 7fd95866a1
29 changed files with 39 additions and 143 deletions

Wyświetl plik

@ -10,13 +10,6 @@ from auto_archiver.core import Metadata, Step
class Database(Step, ABC):
name = "database"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def init(name: str, config: dict) -> Database:
# only for typing...
return Step.init(name, config, Database)
def started(self, item: Metadata) -> None:
"""signals the DB that the given item archival has started"""

Wyświetl plik

@ -18,14 +18,5 @@ class Enricher(Step, ABC):
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
name = "enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
# only for typing...
def init(name: str, config: dict) -> Enricher:
return Step.init(name, config, Enricher)
@abstractmethod
def enrich(self, to_enrich: Metadata) -> None: pass

Wyświetl plik

@ -9,13 +9,5 @@ from auto_archiver.core import Step
class Feeder(Step):
name = "feeder"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def init(name: str, config: dict) -> Feeder:
# only for code typing
return Step.init(name, config, Feeder)
@abstractmethod
def __iter__(self) -> Metadata: return None

Wyświetl plik

@ -6,16 +6,12 @@ flexible setup in various environments.
"""
import argparse
from ruamel.yaml import YAML, CommentedMap
from ruamel.yaml.comments import CommentedMap
from ruamel.yaml import YAML, CommentedMap, add_representer
from dataclasses import dataclass, field
from collections import OrderedDict
from collections.abc import Iterable
from copy import deepcopy
from .loader import MODULE_TYPES
from typing import Any, List
from typing import Any, List, Type
# configurable_parents = [
# Feeder,

Wyświetl plik

@ -152,12 +152,12 @@ class ArchivingOrchestrator:
if not modules:
modules = available_modules(with_manifest=True)
module: Module
for module in modules:
if not module.configs:
# this module has no configs, don't show anything in the help
# (TODO: do we want to show something about this module though, like a description?)
continue
group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
for name, kwargs in module.configs.items():
# TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
@ -165,8 +165,8 @@ class ArchivingOrchestrator:
kwargs.pop('cli_set', None)
kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
try:
kwargs['type'] = getattr(__builtins__, kwargs.get('type', 'str'))
except AttributeError:
kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
except KeyError:
kwargs['type'] = getattr(validators, kwargs['type'])
group.add_argument(f"--{module.name}.{name}", **kwargs)
@ -207,7 +207,7 @@ class ArchivingOrchestrator:
exit()
if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
logger.error(f"Only one feeder is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
exit()
for i, module in enumerate(modules_to_load):

Wyświetl plik

@ -5,10 +5,12 @@
"external_dependencies": {
"python": ["loguru"],
},
'entry_point': 'cli_feeder::CLIFeeder',
"configs": {
"urls": {
"default": None,
"help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
"nargs": "+",
},
},
"description": """

Wyświetl plik

@ -7,12 +7,6 @@ from auto_archiver.core import Metadata, ArchivingContext
class CLIFeeder(Feeder):
name = "cli_feeder"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
if type(self.urls) != list or len(self.urls) == 0:
raise Exception("CLI Feeder did not receive any URL to process")
def __iter__(self) -> Metadata:
for url in self.urls:
logger.debug(f"Processing {url}")

Wyświetl plik

@ -8,11 +8,6 @@ class ConsoleDb(Database):
"""
Outputs results to the console
"""
name = "console_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def started(self, item: Metadata) -> None:
logger.warning(f"STARTED {item}")

Wyświetl plik

@ -4,6 +4,7 @@
"requires_setup": False,
"external_dependencies": {"python": ["loguru"]
},
'entry_point': 'csv_db::CSVDb',
"configs": {
"csv_file": {"default": "db.csv", "help": "CSV file name"}
},

Wyświetl plik

@ -11,13 +11,6 @@ class CSVDb(Database):
"""
Outputs results to a CSV file
"""
name = "csv_db"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.assert_valid_string("csv_file")
def done(self, item: Metadata, cached: bool=False) -> None:
"""archival result ready - should be saved to DB"""

Wyświetl plik

@ -6,6 +6,8 @@
"python": ["loguru"],
"bin": [""]
},
'requires_setup': True,
'entry_point': "csv_feeder::CSVFeeder",
"configs": {
"files": {
"default": None,

Wyświetl plik

@ -2,7 +2,7 @@
'name': 'Generic Extractor',
'version': '0.1.0',
'author': 'Bellingcat',
'type': ['extractor', 'feeder', 'enricher'],
'type': ['extractor'],
'requires_setup': False,
'dependencies': {
'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],

Wyświetl plik

@ -1,7 +1,7 @@
{
"name": "Google Sheets Feeder",
"type": ["feeder"],
"entry_point": "GsheetsFeeder",
"entry_point": "gsheet_feeder::GsheetsFeeder",
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "gspread", "python-slugify"],

Wyświetl plik

@ -8,7 +8,10 @@
"configs": {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
# TODO add non-negative requirement to match previous implementation?
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
"chunksize": {"default": 1.6e7,
"help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
'type': 'positive_number',
},
},
"description": """
Generates cryptographic hashes for media files to ensure data integrity and authenticity.

Wyświetl plik

@ -18,36 +18,6 @@ class HashEnricher(Enricher):
"""
Calculates hashes for Media instances
"""
name = "hash_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
algos = self.configs()["algorithm"]
algo_choices = algos["choices"]
if not getattr(self, 'algorithm', None):
if not config.get('algorithm'):
logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
self.algorithm = algos["default"]
else:
self.algorithm = config["algorithm"]
assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
if not getattr(self, 'chunksize', None):
if config.get('chunksize'):
self.chunksize = config["chunksize"]
else:
self.chunksize = self.configs()["chunksize"]["default"]
try:
self.chunksize = int(self.chunksize)
except ValueError:
raise ValueError(f"Invalid chunksize value: {self.chunksize}. Must be an integer.")
assert self.chunksize >= -1, "read length must be non-negative or -1"
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()

Wyświetl plik

@ -16,17 +16,17 @@ from auto_archiver.utils.misc import random_str
@dataclass
class HtmlFormatter(Formatter):
name = "html_formatter"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
# JinjaHelper class static methods are added as filters
self.environment.filters.update({
k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
})
self.template = self.environment.get_template("html_template.html")
# TODO: fix setting up template with new config method
# def __init__(self, config: dict) -> None:
# # without this STEP.__init__ is not called
# super().__init__(config)
# self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
# # JinjaHelper class static methods are added as filters
# self.environment.filters.update({
# k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
# })
# self.template = self.environment.get_template("html_template.html")
def format(self, item: Metadata) -> Media:
url = item.get_url()

Wyświetl plik

@ -1 +1 @@
from .local import LocalStorage
from .local_storage import LocalStorage

Wyświetl plik

@ -11,9 +11,10 @@ from auto_archiver.base_processors import Storage
class LocalStorage(Storage):
name = "local_storage"
def __init__(self, config: dict) -> None:
super().__init__(config)
os.makedirs(self.save_to, exist_ok=True)
def __init__(self) -> None:
super().__init__()
# TODO: fix up passing config values to 'steps'
# os.makedirs(self.save_to, exist_ok=True)
def get_cdn_url(self, media: Media) -> str:
# TODO: is this viable with Storage.configs on path/filename?

Wyświetl plik

@ -10,12 +10,6 @@ class MetaEnricher(Enricher):
"""
Adds metadata information about the archive operations, to be included at the end of all enrichments
"""
name = "meta_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()

Wyświetl plik

@ -10,11 +10,6 @@ class MetadataEnricher(Enricher):
"""
Extracts metadata information from files using exiftool.
"""
name = "metadata_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def enrich(self, to_enrich: Metadata) -> None:

Wyświetl plik

@ -1,7 +1,7 @@
m = {
{
"name": "Mute Formatter",
"type": ["formatter"],
"requires_setup": False,
"requires_setup": True,
"external_dependencies": {
},
"description": """ Default formatter.

Wyświetl plik

@ -1,16 +1,12 @@
from __future__ import annotations
from dataclasses import dataclass
from ..core import Metadata, Media
from . import Formatter
from auto_archiver.core import Metadata, Media
from auto_archiver.base_processors import Formatter
@dataclass
class MuteFormatter(Formatter):
name = "mute_formatter"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
def format(self, item: Metadata) -> Media: return None

Wyświetl plik

@ -25,11 +25,6 @@ class PdqHashEnricher(Enricher):
Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection.
Ideally this enrichment is orchestrated to run after the thumbnail_enricher.
"""
name = "pdq_hash_enricher"
def __init__(self, config: dict) -> None:
# Without this STEP.__init__ is not called
super().__init__(config)
def enrich(self, to_enrich: Metadata) -> None:
url = to_enrich.get_url()

Wyświetl plik

@ -20,7 +20,7 @@
"region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"},
"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
"random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
"endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime"

Wyświetl plik

@ -5,6 +5,7 @@
"external_dependencies": {
"python": ["loguru", "python-slugify"],
},
'entry_point': 'ssl_enricher::SSLEnricher',
"configs": {
"skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
},

Wyświetl plik

@ -11,11 +11,6 @@ class SSLEnricher(Enricher):
"""
Retrieves SSL certificate information for a domain, as a file
"""
name = "ssl_enricher"
def __init__(self, config: dict) -> None:
super().__init__(config)
self.skip_when_nothing_archived = bool(self.skip_when_nothing_archived)
def enrich(self, to_enrich: Metadata) -> None:
if not to_enrich.media and self.skip_when_nothing_archived: return

Wyświetl plik

@ -11,11 +11,6 @@ class TelegramExtractor(Extractor):
Extractor for telegram that does not require login, but the telethon_extractor is much more advised,
will only return if at least one image or one video is found
"""
name = "telegram_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)
def download(self, item: Metadata) -> Metadata:
url = item.get_url()

Wyświetl plik

@ -1,4 +1,3 @@
import json
{
"name": "telethon_extractor",
"type": ["extractor"],
@ -42,4 +41,4 @@ To use the `TelethonExtractor`, you must configure the following:
- **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.
"""
}
}

Wyświetl plik

@ -18,13 +18,6 @@ class ThumbnailEnricher(Enricher):
"""
Generates thumbnails for all the media
"""
name = "thumbnail_enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
self.thumbnails_per_second = int(self.thumbnails_per_minute) / 60
self.max_thumbnails = int(self.max_thumbnails)
def enrich(self, to_enrich: Metadata) -> None:
"""