Mirror of https://github.com/bellingcat/auto-archiver
Further fixes/changes to loading 'types' for config + manifest edits
parent
14e2479599
commit
7fd95866a1
@@ -10,13 +10,6 @@ from auto_archiver.core import Metadata, Step
 class Database(Step, ABC):
 
     name = "database"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    def init(name: str, config: dict) -> Database:
-        # only for typing...
-        return Step.init(name, config, Database)
     def started(self, item: Metadata) -> None:
        """signals the DB that the given item archival has started"""
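
Note on the pattern: the `__init__`/`init` boilerplate removed here (and in the many similar hunks below) existed only to force `Step.__init__` to run and to give type-checkers a typed `init`. A hedged sketch of why it becomes redundant once the base class applies config itself — `Step` here is a stand-in and `PrintDb` is hypothetical, not the repository's implementation:

    from abc import ABC

    class Step(ABC):
        # stand-in: copies this step's config section onto the instance
        name = "step"

        def __init__(self, config: dict) -> None:
            for key, value in config.get(self.name, {}).items():
                setattr(self, key, value)

    class Database(Step, ABC):
        name = "database"

        def started(self, item) -> None:
            """signals the DB that the given item archival has started"""

    class PrintDb(Database):
        # no per-class __init__ needed: Step.__init__ runs by default
        name = "print_db"

        def started(self, item) -> None:
            print(f"STARTED {item}")

    db = PrintDb({"print_db": {"verbose": True}})
    assert db.verbose is True
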
@@ -18,14 +18,5 @@ class Enricher(Step, ABC):
     """Base classes and utilities for enrichers in the Auto-Archiver system."""
     name = "enricher"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-
-    # only for typing...
-    def init(name: str, config: dict) -> Enricher:
-        return Step.init(name, config, Enricher)
-
     @abstractmethod
     def enrich(self, to_enrich: Metadata) -> None: pass
@@ -9,13 +9,5 @@ from auto_archiver.core import Step
 class Feeder(Step):
     name = "feeder"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
-    def init(name: str, config: dict) -> Feeder:
-        # only for code typing
-        return Step.init(name, config, Feeder)
-
     @abstractmethod
     def __iter__(self) -> Metadata: return None
@@ -6,16 +6,12 @@ flexible setup in various environments.
 """
 
 import argparse
-from ruamel.yaml import YAML, CommentedMap
-from ruamel.yaml.comments import CommentedMap
+from ruamel.yaml import YAML, CommentedMap, add_representer
 
 from dataclasses import dataclass, field
-from collections import OrderedDict
 from collections.abc import Iterable
-from copy import deepcopy
 from .loader import MODULE_TYPES
 
-from typing import Any, List
+from typing import Any, List, Type
 
 # configurable_parents = [
 #     Feeder,
@@ -152,12 +152,12 @@ class ArchivingOrchestrator:
         if not modules:
             modules = available_modules(with_manifest=True)
 
+        module: Module
         for module in modules:
             if not module.configs:
                 # this module has no configs, don't show anything in the help
                 # (TODO: do we want to show something about this module though, like a description?)
                 continue
-
             group = parser.add_argument_group(module.display_name or module.name, f"{module.description[:100]}...")
             for name, kwargs in module.configs.items():
                 # TODO: go through all the manifests and make sure we're not breaking anything with removing cli_set
@@ -165,8 +165,8 @@ class ArchivingOrchestrator:
                 kwargs.pop('cli_set', None)
                 kwargs['dest'] = f"{module.name}.{kwargs.pop('dest', name)}"
                 try:
-                    kwargs['type'] = getattr(__builtins__, kwargs.get('type', 'str'))
-                except AttributeError:
+                    kwargs['type'] = __builtins__.get(kwargs.get('type'), str)
+                except KeyError:
                     kwargs['type'] = getattr(validators, kwargs['type'])
                 group.add_argument(f"--{module.name}.{name}", **kwargs)
 
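
The hunk above is the heart of the 'types' change: a manifest can declare `'type'` as a string, which is resolved to a callable for argparse — built-in names first, falling back to named validator functions. A hedged sketch of that resolution rule; it uses `import builtins`, which sidesteps the module-vs-dict ambiguity of `__builtins__` that the try/except above works around, and `VALIDATORS`/`positive_number` are stand-ins:

    import builtins

    def positive_number(value):
        # stand-in for the 'positive_number' validator referenced by the
        # hash_enricher manifest further down
        n = float(value)
        if n <= 0:
            raise ValueError(f"{value!r} is not a positive number")
        return n

    VALIDATORS = {"positive_number": positive_number}

    def resolve_type(type_name):
        # built-in names ('str', 'int', 'float', 'bool', ...) resolve directly
        builtin = getattr(builtins, type_name or "str", None)
        if callable(builtin):
            return builtin
        # anything else must be a registered validator
        return VALIDATORS[type_name]

    assert resolve_type("int")("42") == 42
    assert resolve_type("positive_number")("1.6e7") == 16000000.0
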
@@ -207,7 +207,7 @@ class ArchivingOrchestrator:
             exit()
 
         if (module_type == 'feeder' or module_type == 'formatter') and len(step_items) > 1:
-            logger.error(f"Only one feeder is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
+            logger.error(f"Only one {module_type} is allowed, found {len(step_items)} {module_type}s. Please remove one of the following from your configuration file: {modules_to_load}")
             exit()
 
         for i, module in enumerate(modules_to_load):
@@ -5,10 +5,12 @@
     "external_dependencies": {
         "python": ["loguru"],
     },
+    'entry_point': 'cli_feeder::CLIFeeder',
     "configs": {
         "urls": {
             "default": None,
             "help": "URL(s) to archive, either a single URL or a list of urls, should not come from config.yaml",
+            "nargs": "+",
         },
     },
     "description": """
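
With `"nargs": "+"`, argparse accepts one or more URLs for this option directly on the command line, matching the help text's note that they should not come from config.yaml. A hypothetical invocation (assuming the packaged `auto-archiver` console script):

    auto-archiver --cli_feeder.urls https://example.com/post/1 https://example.com/post/2
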
@@ -7,12 +7,6 @@ from auto_archiver.core import Metadata, ArchivingContext
 class CLIFeeder(Feeder):
     name = "cli_feeder"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        if type(self.urls) != list or len(self.urls) == 0:
-            raise Exception("CLI Feeder did not receive any URL to process")
-
     def __iter__(self) -> Metadata:
         for url in self.urls:
             logger.debug(f"Processing {url}")
@@ -8,11 +8,6 @@ class ConsoleDb(Database):
     """
     Outputs results to the console
     """
     name = "console_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
     def started(self, item: Metadata) -> None:
         logger.warning(f"STARTED {item}")
@@ -4,6 +4,7 @@
     "requires_setup": False,
     "external_dependencies": {"python": ["loguru"]
     },
+    'entry_point': 'csv_db::CSVDb',
     "configs": {
         "csv_file": {"default": "db.csv", "help": "CSV file name"}
     },
@@ -11,13 +11,6 @@ class CSVDb(Database):
     """
     Outputs results to a CSV file
     """
     name = "csv_db"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.assert_valid_string("csv_file")
-
-
     def done(self, item: Metadata, cached: bool=False) -> None:
         """archival result ready - should be saved to DB"""
@@ -6,6 +6,8 @@
         "python": ["loguru"],
         "bin": [""]
     },
+    'requires_setup': True,
+    'entry_point': "csv_feeder::CSVFeeder",
     "configs": {
         "files": {
             "default": None,
@@ -2,7 +2,7 @@
     'name': 'Generic Extractor',
     'version': '0.1.0',
     'author': 'Bellingcat',
-    'type': ['extractor', 'feeder', 'enricher'],
+    'type': ['extractor'],
     'requires_setup': False,
     'dependencies': {
         'python': ['yt_dlp', 'requests', 'loguru', 'slugify'],
@@ -1,7 +1,7 @@
 {
     "name": "Google Sheets Feeder",
     "type": ["feeder"],
-    "entry_point": "GsheetsFeeder",
+    "entry_point": "gsheet_feeder::GsheetsFeeder",
     "requires_setup": True,
     "external_dependencies": {
         "python": ["loguru", "gspread", "python-slugify"],
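
This and the other manifest edits in this commit converge on a `file::Class` entry-point convention, so one string tells the loader both the module file and the class to instantiate. A hedged sketch of how such a string might be consumed — the loader names here are assumptions, not the repository's code:

    import importlib

    def load_entry_point(package: str, entry_point: str):
        # "gsheet_feeder::GsheetsFeeder" -> module file + class name
        module_name, class_name = entry_point.split("::")
        module = importlib.import_module(f"{package}.{module_name}")
        return getattr(module, class_name)

    # e.g. load_entry_point("auto_archiver.modules.gsheet_feeder",
    #                       "gsheet_feeder::GsheetsFeeder")
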
@@ -8,7 +8,10 @@
     "configs": {
         "algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
         # TODO add non-negative requirement to match previous implementation?
-        "chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
+        "chunksize": {"default": 1.6e7,
+            "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB",
+            'type': 'positive_number',
+        },
     },
     "description": """
     Generates cryptographic hashes for media files to ensure data integrity and authenticity.
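
For context on what `chunksize` controls: the enricher hashes each file in fixed-size reads so large files never have to fit in RAM, hence the new `positive_number` constraint. A minimal standalone illustration with hashlib (a sketch, not the enricher's code), mirroring the manifest's algorithm choices and 16MB default:

    import hashlib

    def hash_file(path, algorithm="SHA-256", chunksize=int(1.6e7)):
        # read the file chunksize bytes at a time and feed each chunk
        # to the incremental hasher
        hasher = {"SHA-256": hashlib.sha256, "SHA3-512": hashlib.sha3_512}[algorithm]()
        with open(path, "rb") as f:
            while chunk := f.read(chunksize):
                hasher.update(chunk)
        return hasher.hexdigest()
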
@@ -18,36 +18,6 @@ class HashEnricher(Enricher):
     """
     Calculates hashes for Media instances
     """
     name = "hash_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        algos = self.configs()["algorithm"]
-        algo_choices = algos["choices"]
-        if not getattr(self, 'algorithm', None):
-            if not config.get('algorithm'):
-                logger.warning(f"No hash algorithm selected, defaulting to {algos['default']}")
-                self.algorithm = algos["default"]
-            else:
-                self.algorithm = config["algorithm"]
-
-        assert self.algorithm in algo_choices, f"Invalid hash algorithm selected, must be one of {algo_choices} (you selected {self.algorithm})."
-
-        if not getattr(self, 'chunksize', None):
-            if config.get('chunksize'):
-                self.chunksize = config["chunksize"]
-            else:
-                self.chunksize = self.configs()["chunksize"]["default"]
-
-        try:
-            self.chunksize = int(self.chunksize)
-        except ValueError:
-            raise ValueError(f"Invalid chunksize value: {self.chunksize}. Must be an integer.")
-
-        assert self.chunksize >= -1, "read length must be non-negative or -1"
-
-        ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
-
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
@@ -16,17 +16,17 @@ from auto_archiver.utils.misc import random_str
 
 @dataclass
 class HtmlFormatter(Formatter):
     name = "html_formatter"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
-        # JinjaHelper class static methods are added as filters
-        self.environment.filters.update({
-            k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
-        })
-        self.template = self.environment.get_template("html_template.html")
+    # TODO: fix setting up template with new config method
+    # def __init__(self, config: dict) -> None:
+    #     # without this STEP.__init__ is not called
+    #     super().__init__(config)
+    #     self.environment = Environment(loader=FileSystemLoader(os.path.join(pathlib.Path(__file__).parent.resolve(), "templates/")), autoescape=True)
+    #     # JinjaHelper class static methods are added as filters
+    #     self.environment.filters.update({
+    #         k: v.__func__ for k, v in JinjaHelpers.__dict__.items() if isinstance(v, staticmethod)
+    #     })
+    #     self.template = self.environment.get_template("html_template.html")
 
     def format(self, item: Metadata) -> Media:
         url = item.get_url()
@@ -1 +1 @@
-from .local import LocalStorage
+from .local_storage import LocalStorage
@@ -11,9 +11,10 @@ from auto_archiver.base_processors import Storage
 class LocalStorage(Storage):
     name = "local_storage"
 
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        os.makedirs(self.save_to, exist_ok=True)
+    def __init__(self) -> None:
+        super().__init__()
+        # TODO: fix up passing config values to 'steps'
+        # os.makedirs(self.save_to, exist_ok=True)
 
     def get_cdn_url(self, media: Media) -> str:
         # TODO: is this viable with Storage.configs on path/filename?
@@ -10,12 +10,6 @@ class MetaEnricher(Enricher):
     """
     Adds metadata information about the archive operations, to be included at the end of all enrichments
     """
     name = "meta_enricher"
-
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
@@ -10,11 +10,6 @@ class MetadataEnricher(Enricher):
     """
     Extracts metadata information from files using exiftool.
     """
     name = "metadata_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
     def enrich(self, to_enrich: Metadata) -> None:
 
@@ -1,7 +1,7 @@
-m = {
+{
     "name": "Mute Formatter",
     "type": ["formatter"],
-    "requires_setup": False,
+    "requires_setup": True,
     "external_dependencies": {
     },
     "description": """ Default formatter.
@@ -1,16 +1,12 @@
 from __future__ import annotations
 from dataclasses import dataclass
 
-from ..core import Metadata, Media
-from . import Formatter
+from auto_archiver.core import Metadata, Media
+from auto_archiver.base_processors import Formatter
 
 
 @dataclass
 class MuteFormatter(Formatter):
     name = "mute_formatter"
 
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-
     def format(self, item: Metadata) -> Media: return None
@@ -25,11 +25,6 @@ class PdqHashEnricher(Enricher):
     Calculates perceptual hashes for Media instances using PDQ, allowing for (near-)duplicate detection.
     Ideally this enrichment is orchestrated to run after the thumbnail_enricher.
     """
     name = "pdq_hash_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # Without this STEP.__init__ is not called
-        super().__init__(config)
-
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
@@ -20,7 +20,7 @@
             "region": {"default": None, "help": "S3 region name"},
             "key": {"default": None, "help": "S3 API key"},
             "secret": {"default": None, "help": "S3 API secret"},
-            "random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
+            "random_no_duplicate": {"default": False, "help": "if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `no-dups/`"},
             "endpoint_url": {
                 "default": 'https://{region}.digitaloceanspaces.com',
                 "help": "S3 bucket endpoint, {region} are inserted at runtime"
@@ -5,6 +5,7 @@
     "external_dependencies": {
         "python": ["loguru", "python-slugify"],
     },
+    'entry_point': 'ssl_enricher::SSLEnricher',
    "configs": {
         "skip_when_nothing_archived": {"default": True, "help": "if true, will skip enriching when no media is archived"},
     },
@@ -11,11 +11,6 @@ class SSLEnricher(Enricher):
     """
     Retrieves SSL certificate information for a domain, as a file
     """
     name = "ssl_enricher"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-        self.skip_when_nothing_archived = bool(self.skip_when_nothing_archived)
-
     def enrich(self, to_enrich: Metadata) -> None:
         if not to_enrich.media and self.skip_when_nothing_archived: return
@@ -11,11 +11,6 @@ class TelegramExtractor(Extractor):
     Extractor for telegram that does not require login, but the telethon_extractor is much more advised,
     will only return if at least one image or one video is found
     """
     name = "telegram_extractor"
-
-    def __init__(self, config: dict) -> None:
-        super().__init__(config)
-
-
     def download(self, item: Metadata) -> Metadata:
         url = item.get_url()
@@ -1,4 +1,3 @@
-import json
 {
     "name": "telethon_extractor",
     "type": ["extractor"],
@@ -42,4 +41,4 @@ To use the `TelethonExtractor`, you must configure the following:
 - **Channel Invites**: Optional, specify a JSON string of invite links to join channels during setup.
 
     """
-    }
+}
@@ -18,13 +18,6 @@ class ThumbnailEnricher(Enricher):
     """
     Generates thumbnails for all the media
     """
     name = "thumbnail_enricher"
-
-    def __init__(self, config: dict) -> None:
-        # without this STEP.__init__ is not called
-        super().__init__(config)
-        self.thumbnails_per_second = int(self.thumbnails_per_minute) / 60
-        self.max_thumbnails = int(self.max_thumbnails)
-
     def enrich(self, to_enrich: Metadata) -> None:
         """