More manifests, base modules and rename from archiver to extractor.

pull/183/head
erinhmclark 2025-01-23 16:40:48 +00:00
parent 9db26cdfc2
commit 1274a1b231
93 changed files with 378 additions and 238 deletions

View file

@ -1,4 +1,4 @@
FROM webrecorder/browsertrix-crawler:1.0.4 AS base
FROM webrecorder/browsertrix-crawler:1.4.2 AS base
ENV RUNNING_IN_DOCKER=1 \
LANG=C.UTF-8 \
@ -22,28 +22,30 @@ RUN add-apt-repository ppa:mozillateam/ppa && \
# Poetry and runtime
FROM base AS runtime
FROM base AS poetry-env
ENV POETRY_NO_INTERACTION=1 \
POETRY_VIRTUALENVS_IN_PROJECT=1 \
POETRY_VIRTUALENVS_CREATE=1
RUN pip install --upgrade pip && \
pip install "poetry>=2.0.0,<3.0.0"
# Create a virtual environment for poetry and install it
RUN python3 -m venv /poetry-venv && \
/poetry-venv/bin/python -m pip install --upgrade pip && \
/poetry-venv/bin/python -m pip install "poetry>=2.0.0,<3.0.0"
WORKDIR /app
COPY pyproject.toml poetry.lock README.md ./
# Copy dependency files and install dependencies (excluding the package itself)
RUN poetry install --only main --no-root --no-cache
RUN /poetry-venv/bin/poetry install --only main --no-root --no-cache
# Copy code: This is needed for poetry to install the package itself,
# but the environment should be cached from the previous step if toml and lock files haven't changed
COPY ./src/ .
RUN poetry install --only main --no-cache
RUN /poetry-venv/bin/poetry install --only main --no-cache
# Update PATH to include virtual environment binaries
@ -55,4 +57,3 @@ ENTRYPOINT ["python3", "-m", "auto_archiver"]
# should be executed with 2 volumes (3 if local_storage is used)
# docker run --rm -v $PWD/secrets:/app/secrets -v $PWD/local_archive:/app/local_archive aa pipenv run python3 -m auto_archiver --config secrets/orchestration.yaml

View file

@ -1,8 +0,0 @@
"""
Archivers are responsible for retrieving the content from various external platforms.
They act as specialized modules, each tailored to interact with a specific platform,
service, or data source. The archivers collectively enable the tool to comprehensively
collect and preserve a variety of content types, such as posts, images, videos and metadata.
"""
from .archiver import Archiver

View file

@ -0,0 +1,6 @@
from .database import Database
from .enricher import Enricher
from .feeder import Feeder
from .storage import Storage
from .extractor import Extractor
from .formatter import Formatter

View file

@ -3,13 +3,13 @@ from dataclasses import dataclass
from abc import abstractmethod, ABC
from typing import Union
from ..core import Metadata, Step
from auto_archiver.core import Metadata, Step
@dataclass
class Database(Step, ABC):
name = "database"
name = "database"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)

View file

@ -0,0 +1,31 @@
"""
Enrichers are modular components that enhance archived content by adding
context, metadata, or additional processing.
These add additional information to the context, such as screenshots, hashes, and metadata.
They are designed to work within the archiving pipeline, operating on `Metadata` objects after
the archiving step and before storage or formatting.
Enrichers are optional but highly useful for making the archived data more powerful.
"""
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from auto_archiver.core import Metadata, Step
@dataclass
class Enricher(Step, ABC):
"""Base classes and utilities for enrichers in the Auto-Archiver system."""
name = "enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
# only for typing...
def init(name: str, config: dict) -> Enricher:
return Step.init(name, config, Enricher)
@abstractmethod
def enrich(self, to_enrich: Metadata) -> None: pass
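For illustration, a minimal sketch of a concrete enricher built on this base class; the class name, the metadata key, and the Metadata.set/get_url helpers it leans on are assumptions for the example, not part of this commit:
from dataclasses import dataclass
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata

@dataclass
class UrlLengthEnricher(Enricher):  # hypothetical enricher, for illustration only
    name = "url_length_enricher"

    def __init__(self, config: dict) -> None:
        super().__init__(config)  # required so Step.__init__ runs

    def enrich(self, to_enrich: Metadata) -> None:
        # attach a trivial piece of extra context to the Metadata object
        to_enrich.set("url_length", len(to_enrich.get_url()))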

View file

@ -1,7 +1,7 @@
""" The `archiver` module defines the base functionality for implementing archivers in the media archiving framework.
This class provides common utility methods and a standard interface for archivers.
""" The `extractor` module defines the base functionality for implementing extractors in the media archiving framework.
This class provides common utility methods and a standard interface for extractors.
Factory method to initialize an archiver instance based on its name.
Factory method to initialize an extractor instance based on its name.
"""
@ -15,32 +15,32 @@ import mimetypes, requests
from loguru import logger
from retrying import retry
from ..core import Metadata, Step, ArchivingContext
from ..core import Metadata, ArchivingContext
@dataclass
class Archiver:
class Extractor:
"""
Base class for implementing archivers in the media archiving framework.
Base class for implementing extractors in the media archiving framework.
Subclasses must implement the `download` method to define platform-specific behavior.
"""
def setup(self) -> None:
# used when archivers need to login or do other one-time setup
# used when extractors need to login or do other one-time setup
pass
def cleanup(self) -> None:
# called when archivers are done, or upon errors, cleanup any resources
# called when extractors are done, or upon errors, cleanup any resources
pass
def sanitize_url(self, url: str) -> str:
# used to clean unnecessary URL parameters OR unfurl redirect links
return url
def suitable(self, url: str) -> bool:
"""
Returns True if this archiver can handle the given URL
Returns True if this extractor can handle the given URL
Should be overridden by subclasses
"""
return True
@ -84,10 +84,10 @@ class Archiver:
for chunk in d.iter_content(chunk_size=8192):
f.write(chunk)
return to_filename
except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {e}")
@abstractmethod
def download(self, item: Metadata) -> Metadata:
pass
pass
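As a rough sketch of how a platform-specific extractor builds on this base (the class, the URL check, and the Metadata helpers shown are assumptions for illustration, not this commit's code):
from dataclasses import dataclass
from auto_archiver.base_modules.extractor import Extractor
from auto_archiver.core import Metadata

@dataclass
class ExampleExtractor(Extractor):  # hypothetical extractor
    name = "example_extractor"

    def suitable(self, url: str) -> bool:
        # only claim URLs this extractor understands
        return "example.com" in url

    def download(self, item: Metadata) -> Metadata:
        result = Metadata()
        result.set_url(item.get_url())
        result.set_title("archived copy")
        # success() marks the result as archived, as other extractors in this repo do
        return result.success("example")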

View file

@ -1,8 +1,8 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from ..core import Metadata
from ..core import Step
from auto_archiver.core import Metadata
from auto_archiver.core import Step
@dataclass

View file

@ -1,7 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod
from ..core import Metadata, Media, Step
from auto_archiver.core import Metadata, Media, Step
@dataclass

View file

@ -4,10 +4,10 @@ from dataclasses import dataclass
from typing import IO, Optional
import os
from ..utils.misc import random_str
from auto_archiver.utils.misc import random_str
from ..core import Media, Step, ArchivingContext, Metadata
from ..enrichers import HashEnricher
from auto_archiver.core import Media, Step, ArchivingContext, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from loguru import logger
from slugify import slugify

View file

@ -8,9 +8,4 @@ from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator
# from .config import Config
from .media import Media
from .step import Step
from .context import ArchivingContext
from .metadata import Metadata
# from .config import Config

View file

@ -15,7 +15,7 @@ from .loader import MODULE_TYPES
# configurable_parents = [
# Feeder,
# Enricher,
# Archiver,
# Extractor,
# Database,
# Storage,
# Formatter
@ -23,7 +23,7 @@ from .loader import MODULE_TYPES
# ]
# feeder: Feeder
# formatter: Formatter
# archivers: List[Archiver] = field(default_factory=[])
# extractors: List[Extractor] = field(default_factory=[])
# enrichers: List[Enricher] = field(default_factory=[])
# storages: List[Storage] = field(default_factory=[])
# databases: List[Database] = field(default_factory=[])

View file

@ -33,7 +33,7 @@ class ArchivingOrchestrator:
# self.feeder: Feeder = config.feeder
# self.formatter: Formatter = config.formatter
# self.enrichers: List[Enricher] = config.enrichers
# self.archivers: List[Archiver] = config.archivers
# self.extractors: List[Extractor] = config.extractors
# self.databases: List[Database] = config.databases
# self.storages: List[Storage] = config.storages
# ArchivingContext.set("storages", self.storages, keep_on_reset=True)
@ -80,7 +80,7 @@ class ArchivingOrchestrator:
for module_type in MODULE_TYPES:
enabled_modules.extend(yaml_config['steps'].get(f"{module_type}s", []))
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'archivers', 'databases', 'storages', 'formatter'
# add in any extra modules that have been passed on the command line for 'feeders', 'enrichers', 'extractors', 'databases', 'storages', 'formatter'
for module_type in MODULE_TYPES:
if modules := getattr(basic_config, f"{module_type}s", []):
enabled_modules.extend(modules)
@ -98,7 +98,7 @@ class ArchivingOrchestrator:
self.add_module_args(available_modules(with_manifest=True), parser)
breakpoint()
# breakpoint()
parser.set_defaults(**to_dot_notation(yaml_config))
# reload the parser with the new arguments, now that we have them
@ -165,7 +165,8 @@ class ArchivingOrchestrator:
for module_type in MODULE_TYPES:
if module_type == 'enricher':
breakpoint()
pass
# breakpoint()
step_items = []
modules_to_load = self.config['steps'][f"{module_type}s"]
@ -228,7 +229,7 @@ class ArchivingOrchestrator:
def cleanup(self)->None:
logger.info("Cleaning up")
for e in self.config['steps']['extractors']:
breakpoint()
# breakpoint()
e.cleanup()
def feed(self) -> Generator[Metadata]:

View file

@ -1,5 +0,0 @@
""" Databases are used to store the outputs from running the Autp Archiver.
"""
from .database import Database

View file

@ -1,12 +0,0 @@
"""
Enrichers are modular components that enhance archived content by adding
context, metadata, or additional processing.
These add additional information to the context, such as screenshots, hashes, and metadata.
They are designed to work within the archiving pipeline, operating on `Metadata` objects after
the archiving step and before storage or formatting.
Enrichers are optional but highly useful for making the archived data more powerful.
"""

View file

@ -1,22 +0,0 @@
""" Base classes and utilities for enrichers in the Auto-Archiver system.
"""
from __future__ import annotations
from dataclasses import dataclass
from abc import abstractmethod, ABC
from ..core import Metadata, Step
@dataclass
class Enricher(Step, ABC):
name = "enricher"
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
# only for typing...
def init(name: str, config: dict) -> Enricher:
return Step.init(name, config, Enricher)
@abstractmethod
def enrich(self, to_enrich: Metadata) -> None: pass

View file

@ -1,3 +0,0 @@
""" Feeders handle the input of media into the Auto Archiver.
"""

View file

@ -1 +0,0 @@
""" Formatters for the output of the content. """

View file

@ -2,7 +2,7 @@ from typing import Union
import requests, os
from loguru import logger
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata

View file

@ -0,0 +1,38 @@
{
"name": "atlos_storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {
"python": ["loguru", "requests"],
"bin": [""]
},
"configs": {
# TODO: get base storage configs
# TODO also? get_atlos_config_options()
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"cli_set": lambda cli_val, _: cli_val
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"cli_set": lambda cli_val, _: cli_val
},
},
"description": """
AtlosStorage: A storage module for saving media files to the Atlos platform.
### Features
- Uploads media files to Atlos using Atlos-specific APIs.
- Automatically calculates SHA-256 hashes of media files for integrity verification.
- Skips uploads for files that already exist on Atlos with the same hash.
- Supports attaching metadata, such as `atlos_id`, to the uploaded files.
- Provides CDN-like URLs for accessing uploaded media.
### Notes
- Requires Atlos API configuration, including `atlos_url` and `api_token`.
- Files are linked to an `atlos_id` in the metadata, ensuring proper association with Atlos source materials.
"""
}

View file

@ -4,9 +4,9 @@ from loguru import logger
import requests
import hashlib
from ..core import Media, Metadata
from ..storages import Storage
from ..utils import get_atlos_config_options
from auto_archiver.core import Media, Metadata
from auto_archiver.base_modules import Storage
from auto_archiver.utils import get_atlos_config_options
class AtlosStorage(Storage):

View file

@ -1,11 +1,12 @@
import os
from typing import Union
from loguru import logger
from csv import DictWriter
from dataclasses import asdict
import requests
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata
from auto_archiver.utils import get_atlos_config_options

View file

@ -0,0 +1,13 @@
def get_atlos_config_options():
return {
"api_token": {
"default": None,
"help": "An Atlos API token. For more information, see https://docs.atlos.org/technical/api/",
"cli_set": lambda cli_val, _: cli_val
},
"atlos_url": {
"default": "https://platform.atlos.org",
"help": "The URL of your Atlos instance (e.g., https://platform.atlos.org), without a trailing slash.",
"cli_set": lambda cli_val, _: cli_val
},
}
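A module manifest can then reuse these shared options instead of repeating them; a sketch of that pattern (the surrounding manifest keys and the extra "query" option are illustrative, not from this commit):
from auto_archiver.utils import get_atlos_config_options

m = {
    "name": "atlos_feeder",            # hypothetical module reusing the shared options
    "type": ["feeder"],
    "requires_setup": True,
    "configs": {
        **get_atlos_config_options(),  # pulls in api_token and atlos_url
        "query": {"default": None, "help": "optional filter passed to the Atlos API"},
    },
}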

View file

@ -1,7 +1,7 @@
from loguru import logger
import requests
from auto_archiver.feeders import Feeder
from auto_archiver.base_modules import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import get_atlos_config_options

View file

@ -1,6 +1,6 @@
from loguru import logger
from auto_archiver.feeders import Feeder
from auto_archiver.base_modules import Feeder
from auto_archiver.core import Metadata, ArchivingContext

View file

@ -1,6 +1,6 @@
from loguru import logger
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata

View file

@ -3,7 +3,7 @@ from loguru import logger
from csv import DictWriter
from dataclasses import asdict
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata

View file

@ -1,7 +1,7 @@
from loguru import logger
import csv
from auto_archiver.feeders import Feeder
from auto_archiver.base_modules import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import url_or_none

View file

@ -0,0 +1,34 @@
m = {
"name": "Google Drive Storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {
"python": [
"loguru",
"google-api-python-client",
"google-auth",
"google-auth-oauthlib",
"google-auth-httplib2"
],
},
"configs": {
# TODO: get base storage configs
"root_folder_id": {"default": None, "help": "root google drive folder ID to use as storage, found in URL: 'https://drive.google.com/drive/folders/FOLDER_ID'"},
"oauth_token": {"default": None, "help": "JSON filename with Google Drive OAuth token: check auto-archiver repository scripts folder for create_update_gdrive_oauth_token.py. NOTE: storage used will count towards owner of GDrive folder, therefore it is best to use oauth_token_filename over service_account."},
"service_account": {"default": "secrets/service_account.json", "help": "service account JSON file path, same as used for Google Sheets. NOTE: storage used will count towards the developer account."},
},
"description": """
GDriveStorage: A storage module for saving archived content to Google Drive.
### Features
- Saves media files to Google Drive, organizing them into folders based on the provided path structure.
- Supports OAuth token-based authentication or service account credentials for API access.
- Automatically creates folders in Google Drive if they don't exist.
- Retrieves CDN URLs for stored files, enabling easy sharing and access.
### Notes
- Requires setup with either a Google OAuth token or a service account JSON file.
- Files are uploaded to the specified `root_folder_id` and organized by the `media.key` structure.
- Automatically handles Google Drive API token refreshes for long-running jobs.
"""
}

View file

@ -9,8 +9,8 @@ from google.oauth2 import service_account
from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from ..core import Media
from . import Storage
from auto_archiver.core import Media
from auto_archiver.base_modules import Storage
class GDriveStorage(Storage):

View file

@ -1,17 +1,12 @@
import os
import mimetypes
import requests
from loguru import logger
from auto_archiver.core.context import ArchivingContext
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
from auto_archiver.core.metadata import Metadata, Media
from .dropin import GenericDropin, InfoExtractor
class Bluesky(GenericDropin):
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
result = Metadata()
result.set_url(url)
result.set_title(post["record"]["text"])
@ -42,7 +37,7 @@ class Bluesky(GenericDropin):
def _download_bsky_embeds(self, post: dict, archiver: Archiver) -> list[Media]:
def _download_bsky_embeds(self, post: dict, archiver: Extractor) -> list[Media]:
"""
Iterates over image(s) or video in a Bluesky post and downloads them
"""

View file

@ -1,6 +1,6 @@
from yt_dlp.extractor.common import InfoExtractor
from auto_archiver.core.metadata import Metadata
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
class GenericDropin:
"""Base class for dropins for the generic extractor.
@ -30,7 +30,7 @@ class GenericDropin:
raise NotImplementedError("This method should be implemented in the subclass")
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
"""
This method should create a Metadata object from the post data.
"""

View file

@ -5,10 +5,10 @@ from yt_dlp.extractor.common import InfoExtractor
from loguru import logger
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
from ...core import Metadata, Media, ArchivingContext
class GenericExtractor(Archiver):
class GenericExtractor(Extractor):
name = "youtubedl_archiver" #left as is for backwards compat
_dropins = {}

View file

@ -2,7 +2,7 @@ from typing import Type
from auto_archiver.utils import traverse_obj
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
from yt_dlp.extractor.common import InfoExtractor
from dateutil.parser import parse as parse_dt
@ -19,7 +19,7 @@ class Truth(GenericDropin):
def skip_ytdlp_download(self, url, ie_instance: Type[InfoExtractor]) -> bool:
return True
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
def create_metadata(self, post: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
"""
Creates metadata from a truth social post

View file

@ -6,7 +6,7 @@ from slugify import slugify
from auto_archiver.core.metadata import Metadata, Media
from auto_archiver.utils import UrlUtil
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
from .dropin import GenericDropin, InfoExtractor
@ -32,7 +32,7 @@ class Twitter(GenericDropin):
twid = ie_instance._match_valid_url(url).group('id')
return ie_instance._extract_status(twid=twid)
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Archiver, url: str) -> Metadata:
def create_metadata(self, tweet: dict, ie_instance: InfoExtractor, archiver: Extractor, url: str) -> Metadata:
result = Metadata()
try:
if not tweet.get("user") or not tweet.get("created_at"):

View file

@ -1,21 +0,0 @@
# TODO merge with feeder manifest?
{
"name": "gsheet_db",
"type": ["database"],
"requires_setup": True,
"external_dependencies": {"python": [" loguru"],
},
"description": """
Handles integration with Google Sheets for tracking archival tasks.
### Features
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
- Skips redundant updates for empty or invalid data fields.
### Notes
- Currently works only with metadata provided by GsheetFeeder.
- Requires configuration of a linked Google Sheet and appropriate API credentials.
""",
}

View file

@ -1,5 +1,5 @@
{
"name": "Google Sheets Feeder",
"name": "Google Sheets Procesor",
"type": ["feeder"],
"requires_setup": True,
"external_dependencies": {
@ -22,7 +22,12 @@
}
},
"description": """
GsheetsFeeder: A Google Sheets-based feeder for the Auto Archiver.
Google Sheets Module.
Handles feeding from a Google Sheet as well as an optional write-back to the sheet.
## GsheetsFeeder
A Google Sheets-based feeder for the Auto Archiver.
This reads data from Google Sheets and filters rows based on user-defined rules.
The filtered rows are processed into `Metadata` objects.
@ -36,5 +41,18 @@
### Notes
- Requires a Google Service Account JSON file for authentication. Suggested location is `secrets/gsheets_service_account.json`.
- Create the sheet using the template provided in the docs.
## GsheetsDatabase:
Handles integration with Google Sheets for tracking archival tasks.
### Features
- Updates a Google Sheet with the status of the archived URLs, including in progress, success or failure, and method used.
- Saves metadata such as title, text, timestamp, hashes, screenshots, and media URLs to designated columns.
- Formats media-specific metadata, such as thumbnails and PDQ hashes for the sheet.
- Skips redundant updates for empty or invalid data fields.
### Notes
- Currently works only with metadata provided by GsheetFeeder.
- Requires configuration of a linked Google Sheet and appropriate API credentials.
"""
}

View file

@ -1,10 +1,11 @@
from typing import Union, Tuple
import datetime
from urllib.parse import quote
from loguru import logger
from auto_archiver.databases import Database
from auto_archiver.base_modules import Database
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import GWorksheet

View file

@ -13,8 +13,7 @@ import gspread, os
from loguru import logger
from slugify import slugify
# from . import Enricher
from auto_archiver.feeders import Feeder
from auto_archiver.base_modules import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.utils import Gsheets, GWorksheet

View file

@ -0,0 +1 @@
from .hash_enricher import HashEnricher

View file

@ -7,7 +7,7 @@
},
"configs": {
"algorithm": {"default": "SHA-256", "help": "hash algorithm to use", "choices": ["SHA-256", "SHA3-512"]},
"chunksize": {"default": int(1.6e7), "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
"chunksize": {"default": 1.6e7, "help": "number of bytes to use when reading files in chunks (if this value is too large you will run out of RAM), default is 16MB"},
},
"description": """
Generates cryptographic hashes for media files to ensure data integrity and authenticity.

View file

@ -10,7 +10,7 @@ making it suitable for handling large files efficiently.
import hashlib
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata, ArchivingContext
@ -40,7 +40,11 @@ class HashEnricher(Enricher):
else:
self.chunksize = self.configs()["chunksize"]["default"]
self.chunksize = int(self.chunksize)
try:
self.chunksize = int(self.chunksize)
except ValueError:
raise ValueError(f"Invalid chunksize value: {self.chunksize}. Must be an integer.")
assert self.chunksize >= -1, "read length must be non-negative or -1"
ArchivingContext.set("hash_enricher.algorithm", self.algorithm, keep_on_reset=True)
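The chunksize option feeds a chunked read roughly like the following (a simplified sketch, not the module's exact implementation):
import hashlib

def calculate_file_hash(filename: str, algorithm: str = "SHA-256", chunksize: int = int(1.6e7)) -> str:
    # read the file in chunksize-byte pieces so large media never sits fully in RAM
    hasher = hashlib.sha256() if algorithm == "SHA-256" else hashlib.sha3_512()
    with open(filename, "rb") as f:
        while chunk := f.read(chunksize):
            hasher.update(chunk)
    return hasher.hexdigest()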

View file

@ -0,0 +1,13 @@
m = {
"name": "HTML Formatter",
"type": ["formatter"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru", "jinja2"],
"bin": [""]
},
"configs": {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
},
"description": """ """,
}

View file

@ -7,11 +7,11 @@ from loguru import logger
import json
import base64
from ..version import __version__
from ..core import Metadata, Media, ArchivingContext
from . import Formatter
from ..enrichers import HashEnricher
from ..utils.misc import random_str
from auto_archiver.version import __version__
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.base_modules import Formatter
from auto_archiver.modules.hash_enricher import HashEnricher
from auto_archiver.utils.misc import random_str
@dataclass
@ -28,11 +28,11 @@ class HtmlFormatter(Formatter):
})
self.template = self.environment.get_template("html_template.html")
@staticmethod
def configs() -> dict:
return {
"detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
}
# @staticmethod
# def configs() -> dict:
# return {
# "detect_thumbnails": {"default": True, "help": "if true will group by thumbnails generated by thumbnail enricher by id 'thumbnail_00'"}
# }
def format(self, item: Metadata) -> Media:
url = item.get_url()

View file

@ -1,7 +1,6 @@
{
"name": "Instagram API Archiver",
"name": "Instagram API Extractor",
"type": ["extractor"],
"entry_point": "instagram_api_archiver:InstagramApiArchiver",
"external_dependencies":
{"python": ["requests",
"loguru",

View file

@ -1,5 +1,5 @@
"""
The `instagram_api_archiver` module provides tools for archiving various types of Instagram content
The `instagram_api_extractor` module provides tools for archiving various types of Instagram content
using the [Instagrapi API](https://github.com/subzeroid/instagrapi).
Connects to an Instagrapi API deployment and allows for downloading Instagram user profiles,
@ -16,19 +16,19 @@ from loguru import logger
from retrying import retry
from tqdm import tqdm
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Media
from auto_archiver.core import Metadata
class InstagramAPIArchiver(Archiver):
class InstagramAPIExtractor(Extractor):
"""
Uses an https://github.com/subzeroid/instagrapi API deployment to fetch instagram posts data
# TODO: improvement collect aggregates of locations[0].location and mentions for all posts
"""
name = "instagram_api_archiver"
name = "instagram_api_extractor"
global_pattern = re.compile(
r"(?:(?:http|https):\/\/)?(?:www.)?(?:instagram.com)\/(stories(?:\/highlights)?|p|reel)?\/?([^\/\?]*)\/?(\d+)?"

View file

@ -1,7 +1,6 @@
{
"name": "Instagram Archiver",
"name": "Instagram Extractor",
"type": ["extractor"],
"entry_point": "instagram_archiver:InstagramArchiver",
"external_dependencies": {
"python": [
"instaloader",

View file

@ -7,15 +7,15 @@ import re, os, shutil, traceback
import instaloader # https://instaloader.github.io/as-module.html
from loguru import logger
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata
from auto_archiver.core import Media
class InstagramArchiver(Archiver):
class InstagramExtractor(Extractor):
"""
Uses Instaloader to download either a post (inc images, videos, text) or as much as possible from a profile (posts, stories, highlights, ...)
"""
name = "instagram_archiver"
name = "instagram_extractor"
# NB: post regex should be tested before profile
# https://regex101.com/r/MGPquX/1
@ -67,7 +67,7 @@ class InstagramArchiver(Archiver):
elif len(profile_matches):
result = self.download_profile(url, profile_matches[0])
except Exception as e:
logger.error(f"Failed to download with instagram archiver due to: {e}, make sure your account credentials are valid.")
logger.error(f"Failed to download with instagram extractor due to: {e}, make sure your account credentials are valid.")
finally:
shutil.rmtree(self.download_folder, ignore_errors=True)
return result

View file

@ -1,7 +1,6 @@
{
"name": "Instagram Telegram Bot Archiver",
"name": "Instagram Telegram Bot Extractor",
"type": ["extractor"],
"entry_point": "instagram_tbot_archiver:InstagramTbotArchiver",
"external_dependencies": {"python": ["loguru",
"telethon",],
},
@ -13,7 +12,7 @@
"timeout": {"default": 45, "help": "timeout to fetch the instagram content in seconds."},
},
"description": """
The `InstagramTbotArchiver` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
The `InstagramTbotExtractor` module uses a Telegram bot (`instagram_load_bot`) to fetch and archive Instagram content,
such as posts and stories. It leverages the Telethon library to interact with the Telegram API, sending Instagram URLs
to the bot and downloading the resulting media and metadata. The downloaded content is stored as `Media` objects and
returned as part of a `Metadata` object.
@ -26,7 +25,7 @@ returned as part of a `Metadata` object.
### Setup
To use the `InstagramTbotArchiver`, you need to provide the following configuration settings:
To use the `InstagramTbotExtractor`, you need to provide the following configuration settings:
- **API ID and Hash**: Telegram API credentials obtained from [my.telegram.org/apps](https://my.telegram.org/apps).
- **Session File**: Optional path to store the Telegram session file for future use.

View file

@ -1,5 +1,5 @@
"""
InstagramTbotArchiver Module
InstagramTbotExtractor Module
This module provides functionality to archive Instagram content (posts, stories, etc.) using a Telegram bot (`instagram_load_bot`).
It interacts with the Telegram API via the Telethon library to send Instagram URLs to the bot, which retrieves the
@ -15,18 +15,18 @@ from sqlite3 import OperationalError
from loguru import logger
from telethon.sync import TelegramClient
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import random_str
class InstagramTbotArchiver(Archiver):
class InstagramTbotExtractor(Extractor):
"""
calls a telegram bot to fetch instagram posts/stories... and gets available media from it
https://github.com/adw0rd/instagrapi
https://t.me/instagram_load_bot
"""
name = "instagram_tbot_archiver"
name = "instagram_tbot_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)
@ -49,7 +49,7 @@ class InstagramTbotArchiver(Archiver):
try:
self.client = TelegramClient(self.session_file, self.api_id, self.api_hash)
except OperationalError as e:
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_archiver. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
logger.error(f"Unable to access the {self.session_file} session, please make sure you don't use the same session file here and in telethon_extractor. if you do then disable at least one of the archivers for the 1st time you setup telethon session: {e}")
with self.client.start():
logger.success(f"SETUP {self.name} login works.")

View file

@ -0,0 +1,26 @@
m = {
"name": "Local Storage",
"type": ["storage"],
"requires_setup": False,
"external_dependencies": {
"python": ["loguru"],
},
"configs": {
# TODO: get base storage configs
"save_to": {"default": "./archived", "help": "folder where to save archived content"},
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
},
"description": """
LocalStorage: A storage module for saving archived content locally on the filesystem.
### Features
- Saves archived media files to a specified folder on the local filesystem.
- Maintains file metadata during storage using `shutil.copy2`.
- Supports both absolute and relative paths for stored files, configurable via `save_absolute`.
- Automatically creates directories as needed for storing files.
### Notes
- Default storage folder is `./archived`, but this can be changed via the `save_to` configuration.
- The `save_absolute` option can reveal the file structure in output formats; use with caution.
"""
}

View file

@ -4,8 +4,8 @@ from typing import IO
import os
from loguru import logger
from ..core import Media
from ..storages import Storage
from auto_archiver.core import Media
from auto_archiver.base_modules import Storage
class LocalStorage(Storage):

View file

@ -2,7 +2,7 @@ import datetime
import os
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata

View file

@ -2,7 +2,7 @@ import subprocess
import traceback
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata

View file

@ -0,0 +1,9 @@
m = {
"name": "Mute Formatter",
"type": ["formatter"],
"requires_setup": False,
"external_dependencies": {
},
"description": """ Default formatter.
""",
}

View file

@ -16,7 +16,7 @@ import numpy as np
from PIL import Image, UnidentifiedImageError
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata

View file

@ -0,0 +1,40 @@
m = {
"name": "S3 Storage",
"type": ["storage"],
"requires_setup": True,
"external_dependencies": {
"python": ["boto3", "loguru"],
},
"configs": {
# TODO: get base storage configs
"bucket": {"default": None, "help": "S3 bucket name"},
"region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"},
"random_no_duplicate": {"default": False, "help": f"if set, it will override `path_generator`, `filename_generator` and `folder`. It will check if the file already exists and if so it will not upload it again. Creates a new root folder path `{NO_DUPLICATES_FOLDER}`"},
"endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime"
},
"cdn_url": {
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
},
"private": {"default": False, "help": "if true S3 files will not be readable online"},
},
"description": """
S3Storage: A storage module for saving media files to an S3-compatible object storage.
### Features
- Uploads media files to an S3 bucket with customizable configurations.
- Supports `random_no_duplicate` mode to avoid duplicate uploads by checking existing files based on SHA-256 hashes.
- Automatically generates unique paths for files when duplicates are found.
- Configurable endpoint and CDN URL for different S3-compatible providers.
- Supports both private and public file storage, with public files being readable online.
### Notes
- Requires S3 credentials (API key and secret) and a bucket name to function.
- The `random_no_duplicate` option ensures no duplicate uploads by leveraging hash-based folder structures.
- Uses `boto3` for interaction with the S3 API.
"""
}
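A rough sketch of how a hash-based random_no_duplicate key could be derived under NO_DUPLICATES_FOLDER (illustrative only; the module's actual path and existence-check logic may differ):
import hashlib, os, uuid

NO_DUPLICATES_FOLDER = "no-dups/"

def no_duplicate_key(filename: str) -> str:
    # identical content always maps to the same folder, so a second upload can be skipped
    with open(filename, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()[:24]
    ext = os.path.splitext(filename)[1]
    return f"{NO_DUPLICATES_FOLDER}{digest}/{uuid.uuid4().hex[:8]}{ext}"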

View file

@ -2,10 +2,11 @@
from typing import IO
import boto3, os
from ..utils.misc import random_str
from ..core import Media
from ..storages import Storage
from ..enrichers import HashEnricher
from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media
from auto_archiver.base_modules import Storage
# TODO
from auto_archiver.modules.hash_enricher import HashEnricher
from loguru import logger
NO_DUPLICATES_FOLDER = "no-dups/"

View file

@ -5,7 +5,7 @@ import base64
from selenium.common.exceptions import TimeoutException
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.utils import Webdriver, UrlUtil, random_str
from auto_archiver.core import Media, Metadata, ArchivingContext

View file

@ -3,7 +3,7 @@ from slugify import slugify
from urllib.parse import urlparse
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media

View file

@ -1,7 +1,6 @@
{
"name": "Telegram Archiver",
"name": "Telegram Extractor",
"type": ["extractor"],
"entry_point": "telegram_archiver:TelegramArchiver",
"requires_setup": False,
"external_dependencies": {
"python": [
@ -11,7 +10,7 @@
],
},
"description": """
The `TelegramArchiver` retrieves publicly available media content from Telegram message links without requiring login credentials.
The `TelegramExtractor` retrieves publicly available media content from Telegram message links without requiring login credentials.
It processes URLs to fetch images and videos embedded in Telegram messages, ensuring a structured output using `Metadata`
and `Media` objects. Recommended for scenarios where login-based archiving is not viable, although `telethon_archiver`
is advised for more comprehensive functionality.

View file

@ -2,16 +2,16 @@ import requests, re, html
from bs4 import BeautifulSoup
from loguru import logger
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata, Media
class TelegramArchiver(Archiver):
class TelegramExtractor(Extractor):
"""
Archiver for telegram that does not require login, but the telethon_archiver is much more advised,
Extractor for telegram that does not require login, but the telethon_extractor is much more advised,
will only return if at least one image or one video is found
"""
name = "telegram_archiver"
name = "telegram_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)

View file

@ -1,8 +1,7 @@
# TODO rm dependency on json
{
"name": "telethon_archiver",
"name": "telethon_extractor",
"type": ["extractor"],
"entry_point": "telethon_archiver:TelethonArchiver",
"requires_setup": True,
"external_dependencies": {
"python": ["telethon",
@ -25,7 +24,7 @@
}
},
"description": """
The `TelethonArchiver` uses the Telethon library to archive posts and media from Telegram channels and groups.
The `TelethonExtractor` uses the Telethon library to archive posts and media from Telegram channels and groups.
It supports private and public channels, downloading grouped posts with media, and can join channels using invite links
if provided in the configuration.
@ -37,7 +36,7 @@ if provided in the configuration.
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `TelethonArchiver`, you must configure the following:
To use the `TelethonExtractor`, you must configure the following:
- **API ID and API Hash**: Obtain these from [my.telegram.org](https://my.telegram.org/apps).
- **Session File**: Optional, but records login sessions for future use (default: `secrets/anon.session`).
- **Bot Token**: Optional, allows access to additional content (e.g., large videos) but limits private channel archiving.

View file

@ -8,13 +8,13 @@ from loguru import logger
from tqdm import tqdm
import re, time, json, os
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.utils import random_str
class TelethonArchiver(Archiver):
name = "telethon_archiver"
class TelethonArchiver(Extractor):
name = "telethon_extractor"
link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")

View file

@ -9,7 +9,7 @@ and identify important moments without watching the entire video.
import ffmpeg, os
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.utils.misc import random_str

View file

@ -8,9 +8,9 @@ from certvalidator import CertificateValidator, ValidationContext
from asn1crypto import pem
import certifi
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
class TimestampingEnricher(Enricher):

View file

@ -1,7 +1,6 @@
{
"name": "Twitter API Archiver",
"name": "Twitter API Extractor",
"type": ["extractor"],
"entry_point": "twitter_api_archiver:TwitterApiArchiver",
"requires_setup": True,
"external_dependencies": {
"python": ["requests",
@ -20,7 +19,7 @@
"access_secret": {"default": None, "help": "twitter API access_secret"},
},
"description": """
The `TwitterApiArchiver` fetches tweets and associated media using the Twitter API.
The `TwitterApiExtractor` fetches tweets and associated media using the Twitter API.
It supports multiple API configurations for extended rate limits and reliable access.
Features include URL expansion, media downloads (e.g., images, videos), and structured output
via `Metadata` and `Media` objects. Requires Twitter API credentials such as bearer tokens
@ -34,7 +33,7 @@
- Outputs structured metadata and media using `Metadata` and `Media` objects.
### Setup
To use the `TwitterApiArchiver`, you must provide valid Twitter API credentials via configuration:
To use the `TwitterApiExtractor`, you must provide valid Twitter API credentials via configuration:
- **Bearer Token(s)**: A single token or a list for rate-limited API access.
- **Consumer Key and Secret**: Required for user-authenticated API access.
- **Access Token and Secret**: Complements the consumer key for enhanced API capabilities.

View file

@ -8,11 +8,11 @@ from loguru import logger
from pytwitter import Api
from slugify import slugify
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata,Media
class TwitterApiArchiver(Archiver):
name = "twitter_api_archiver"
class TwitterApiExtractor(Extractor):
name = "twitter_api_extractor"
link_pattern = re.compile(r"(?:twitter|x).com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
def __init__(self, config: dict) -> None:

View file

@ -1,7 +1,6 @@
{
"name": "VKontakte Archiver",
"name": "VKontakte Extractor",
"type": ["extractor"],
"entry_point": "vk_archiver:VKArchiver",
"requires_setup": True,
"depends": ["core", "utils"],
"external_dependencies": {
@ -14,7 +13,7 @@
"session_file": {"default": "secrets/vk_config.v2.json", "help": "valid VKontakte password"},
},
"description": """
The `VkArchiver` fetches posts, text, and images from VK (VKontakte) social media pages.
The `VkExtractor` fetches posts, text, and images from VK (VKontakte) social media pages.
This archiver is specialized for `/wall` posts and uses the `VkScraper` library to extract
and download content. Note that VK videos are handled separately by the `YTDownloader`.

View file

@ -2,16 +2,16 @@ from loguru import logger
from vk_url_scraper import VkScraper
from auto_archiver.utils.misc import dump_payload
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
class VkArchiver(Archiver):
class VkExtractor(Extractor):
""""
VK videos are handled by YTDownloader, this archiver gets posts text and images.
Currently only works for /wall posts
"""
name = "vk_archiver"
name = "vk_extractor"
def __init__(self, config: dict) -> None:
super().__init__(config)

View file

@ -6,12 +6,11 @@ from loguru import logger
from warcio.archiveiterator import ArchiveIterator
from auto_archiver.core import Media, Metadata, ArchivingContext
from auto_archiver.enrichers import Enricher
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor, Enricher
from auto_archiver.utils import UrlUtil, random_str
class WaczArchiverEnricher(Enricher, Archiver):
class WaczExtractorEnricher(Enricher, Extractor):
"""
Uses https://github.com/webrecorder/browsertrix-crawler to generate a .WACZ archive of the URL
If used with [profiles](https://github.com/webrecorder/browsertrix-crawler#creating-and-using-browser-profiles)

View file

@ -2,12 +2,11 @@ import json
from loguru import logger
import time, requests
from auto_archiver.enrichers import Enricher
from auto_archiver.archivers import Archiver
from auto_archiver.base_modules import Extractor, Enricher
from auto_archiver.utils import UrlUtil
from auto_archiver.core import Metadata
class WaybackArchiverEnricher(Enricher, Archiver):
class WaybackExtractorEnricher(Enricher, Extractor):
"""
Submits the current URL to the webarchive and returns a job_id or completed archive.

View file

@ -2,9 +2,9 @@ import traceback
import requests, time
from loguru import logger
from auto_archiver.enrichers import Enricher
from auto_archiver.base_modules import Enricher
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.storages import S3Storage
from auto_archiver.modules import S3Storage
class WhisperEnricher(Enricher):

View file

@ -1,3 +0,0 @@
""" This module contains the storage classes for the auto-archiver.
"""

View file

@ -1,9 +1,7 @@
import pytest
from auto_archiver.core import Metadata
from auto_archiver.core import Step
from auto_archiver.core.metadata import Metadata
from auto_archiver.archivers.archiver import Archiver
from auto_archiver.base_modules.extractor import Extractor
class TestArchiverBase(object):
archiver_class: str = None
@ -13,7 +11,7 @@ class TestArchiverBase(object):
def setup_archiver(self):
assert self.archiver_class is not None, "self.archiver_class must be set on the subclass"
assert self.config is not None, "self.config must be a dict set on the subclass"
self.archiver: Archiver = self.archiver_class({self.archiver_class.name: self.config})
self.archiver: Extractor = self.archiver_class({self.archiver_class.name: self.config})
def assertValidResponseMetadata(self, test_response: Metadata, title: str, timestamp: str, status: str = ""):
assert test_response is not False
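A concrete test class then only needs to point at an extractor and its config; a hedged sketch (the extractor and option names are made up for illustration):
class TestExampleExtractor(TestArchiverBase):
    # hypothetical subclass; setup_archiver instantiates archiver_class({name: config})
    archiver_class = ExampleExtractor          # any Extractor subclass under test
    config = {"some_option": "some_value"}

    def test_suitable(self):
        assert self.archiver.suitable("https://example.com/post/1")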

View file

@ -1,5 +1,4 @@
from auto_archiver.core.context import ArchivingContext
from auto_archiver.formatters.html_formatter import HtmlFormatter
from auto_archiver.modules.html_formatter import HtmlFormatter
from auto_archiver.core import Metadata, Media