Mirror of https://github.com/bellingcat/auto-archiver
Remove ArchivingContext completely
Context for a specific URL/item is now passed around via the metadata (metadata.set_context('key', 'val') and metadata.get_context('key', default='something')). The only other thing that was passed around in ArchivingContext was the storage info, which is already accessible via self.config.
pull/189/head
parent
d76063c3f3
commit
c25d5cae84
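For orientation, a minimal usage sketch of the replacement API (not part of this commit's diff): the URL and key names below are illustrative, while the set_context/get_context calls and the storages property are taken from the changes that follow.

from auto_archiver.core import Metadata

# Per-item context now lives on the Metadata object instead of a global singleton.
item = Metadata().set_url("https://example.com/some-post")   # illustrative URL
item.set_context("folder", "cli")                             # write a per-item value
folder = item.get_context("folder", default="unnamed")        # read it back, with a default

# Storage info is no longer read from ArchivingContext either: inside any BaseModule
# subclass, self.storages simply returns self.config.get('storages', []).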
@@ -4,7 +4,6 @@
 from .metadata import Metadata
 from .media import Media
 from .module import BaseModule
-from .context import ArchivingContext

 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator
@@ -56,6 +56,10 @@ class BaseModule(ABC):
     # this is set by the orchestrator prior to archiving
     tmp_dir: TemporaryDirectory = None

+    @property
+    def storages(self) -> list:
+        return self.config.get('storages', [])
+
     def setup(self, config: dict):

         authentication = config.get('authentication', {})
@@ -75,9 +79,6 @@ class BaseModule(ABC):
         self.config = config
         for key, val in config.get(self.name, {}).items():
             setattr(self, key, val)

-    def repr(self):
-        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
-
     def auth_for_site(self, site: str) -> dict:
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
@@ -97,4 +98,7 @@ class BaseModule(ABC):
did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it exactly matches.")

                return {}
         return {}
+
+    def repr(self):
+        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
@@ -1,56 +0,0 @@
-""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
-
-This singleton class allows for:
-- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
-- Marking certain values to persist across resets using `keep_on_reset`.
-- Managing temporary directories and other shared data used during the archiving process.
-
-### Key Features:
-- Creates a single global instance.
-- Reset functionality allows for clearing configurations, with options for partial or full resets.
-- Custom getters and setters for commonly used context values like temporary directories.
-
-"""
-
-class ArchivingContext:
-    """
-    Singleton context class for managing global configurations and temporary data.
-
-    ArchivingContext._get_instance() to retrieve it if needed
-    otherwise just
-    ArchivingContext.set(key, value)
-    and
-    ArchivingContext.get(key, default)
-
-    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
-    reset(full_reset=True) will recreate everything including the keep_on_reset status
-    """
-    _instance = None
-
-    def __init__(self):
-        self.configs = {}
-        self.keep_on_reset = set()
-
-    @staticmethod
-    def get_instance():
-        if ArchivingContext._instance is None:
-            ArchivingContext._instance = ArchivingContext()
-        return ArchivingContext._instance
-
-    @staticmethod
-    def set(key, value, keep_on_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        ac.configs[key] = value
-        if keep_on_reset: ac.keep_on_reset.add(key)
-
-    @staticmethod
-    def get(key: str, default=None):
-        return ArchivingContext.get_instance().configs.get(key, default)
-
-    @staticmethod
-    def reset(full_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        if full_reset: ac.keep_on_reset = set()
-        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
-
-    # ---- custom getters/setters for widely used context values
@@ -17,7 +17,7 @@ from loguru import logger
 from retrying import retry
 import re

-from ..core import Metadata, ArchivingContext, BaseModule
+from ..core import Metadata, BaseModule


 class Extractor(BaseModule):
@@ -11,8 +11,6 @@ from dataclasses import dataclass, field
 from dataclasses_json import dataclass_json, config
 import mimetypes

-from .context import ArchivingContext
-
 from loguru import logger


@@ -36,12 +34,11 @@ class Media:
     _mimetype: str = None  # eg: image/jpeg
     _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True))  # always exclude

-    def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
+    def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None:
         # 'Any' typing for metadata to avoid circular imports. Stores the media
         # into the provided/available storages [Storage] repeats the process for
         # its properties, in case they have inner media themselves for now it
         # only goes down 1 level but it's easy to make it recursive if needed.
-        storages = override_storages or ArchivingContext.get("storages")
         if not len(storages):
             logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
             return
@@ -66,8 +63,9 @@ class Media:
         for inner_media in prop_media.all_inner_media(include_self=True):
             yield inner_media

-    def is_stored(self) -> bool:
-        return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
+    def is_stored(self, in_storage) -> bool:
+        # checks if the media is already stored in the given storage
+        return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url() in u])

     def set(self, key: str, value: Any) -> Media:
         self.properties[key] = value
@@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt
 from loguru import logger

 from .media import Media
-from .context import ArchivingContext
-

 @dataclass_json  # annotation order matters
 @dataclass
@@ -32,6 +30,7 @@ class Metadata:

     def __post_init__(self):
         self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
+        self._context = {}

     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """
@@ -57,12 +56,11 @@ class Metadata:
             return right.merge(self)
         return self

-    def store(self: Metadata, override_storages: List = None):
+    def store(self, storages=[]):
         # calls .store for all contained media. storages [Storage]
         self.remove_duplicate_media_by_hash()
-        storages = override_storages or ArchivingContext.get("storages")
         for media in self.media:
-            media.store(override_storages=storages, url=self.get_url(), metadata=self)
+            media.store(url=self.get_url(), metadata=self, storages=storages)

     def set(self, key: str, val: Any) -> Metadata:
         self.metadata[key] = val
@@ -206,3 +204,10 @@ class Metadata:
             if len(r.media) > len(most_complete.media): most_complete = r
             elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
         return most_complete
+
+    def set_context(self, key: str, val: Any) -> Metadata:
+        self._context[key] = val
+        return self
+
+    def get_context(self, key: str, default: Any = None) -> Any:
+        return self._context.get(key, default)
@@ -43,7 +43,6 @@ def setup_paths(paths: list[str]) -> None:
     # sort based on the length of the path, so that the longest path is last in the list
     auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)

-
 def get_module(module_name: str, config: dict) -> BaseModule:
     """
     Gets and sets up a module using the provided config
@@ -69,6 +68,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa
     return module


 def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:

     # search through all valid 'modules' paths. Default is 'modules' in the current directory
     # see odoo/modules/module.py -> get_modules
@@ -17,9 +17,8 @@ import traceback

 from rich_argparse import RichHelpFormatter

-from .context import ArchivingContext

-from .metadata import Metadata
+from .metadata import Metadata, Media
 from ..version import __version__
 from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths
@@ -268,7 +267,6 @@ class ArchivingOrchestrator:
            for url in urls:
                logger.debug(f"Processing URL: '{url}'")
                yield Metadata().set_url(url)
-                ArchivingContext.set("folder", "cli")

        pseudo_module = type('CLIFeeder', (Feeder,), {
            'name': 'cli_feeder',
@@ -297,9 +295,6 @@ class ArchivingOrchestrator:
                    continue
                if loaded_module:
                    step_items.append(loaded_module)
-                # TODO temp solution
-                if module_type == "storage":
-                    ArchivingContext.set("storages", step_items, keep_on_reset=True)

        check_steps_ok()
        self.config['steps'][f"{module_type}s"] = step_items
@@ -449,11 +444,12 @@ class ArchivingOrchestrator:
                logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")

        # 5 - store all downloaded/generated media
-        result.store()
+        result.store(storages=self.storages)

        # 6 - format and store formatted if needed
+        final_media: Media
        if final_media := self.formatters[0].format(result):
-            final_media.store(url=url, metadata=result)
+            final_media.store(url=url, metadata=result, storages=self.storages)
            result.set_final_media(final_media)

        if result.is_empty():
@@ -8,16 +8,16 @@ from slugify import slugify

 from auto_archiver.utils.misc import random_str

-from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata
+from auto_archiver.core import Media, BaseModule, Metadata
 from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher

+from auto_archiver.core.module import get_module
 class Storage(BaseModule):

-    def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
-        if media.is_stored():
+    def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
+        if media.is_stored(in_storage=self):
            logger.debug(f"{media.key} already stored, skipping")
            return
-        self.set_key(media, url)
+        self.set_key(media, url, metadata)
        self.upload(media, metadata=metadata)
        media.add_url(self.get_cdn_url(media))

@@ -32,30 +32,31 @@ class Storage(BaseModule):
        with open(media.filename, 'rb') as f:
            return self.uploadf(f, media, **kwargs)

-    def set_key(self, media: Media, url) -> None:
+    def set_key(self, media: Media, url, metadata: Metadata) -> None:
        """takes the media and optionally item info and generates a key"""
        if media.key is not None and len(media.key) > 0: return
-        folder = ArchivingContext.get("folder", "")
+        folder = metadata.folder
        filename, ext = os.path.splitext(media.filename)

        # Handle path_generator logic
-        path_generator = ArchivingContext.get("path_generator", "url")
+        path_generator = self.config.get("path_generator", "url")
        if path_generator == "flat":
            path = ""
            filename = slugify(filename)  # Ensure filename is slugified
        elif path_generator == "url":
            path = slugify(url)
        elif path_generator == "random":
-            path = ArchivingContext.get("random_path", random_str(24), True)
+            path = self.config.get("random_path", random_str(24), True)
        else:
            raise ValueError(f"Invalid path_generator: {path_generator}")

        # Handle filename_generator logic
-        filename_generator = ArchivingContext.get("filename_generator", "random")
+        filename_generator = self.config.get("filename_generator", "random")
        if filename_generator == "random":
            filename = random_str(24)
        elif filename_generator == "static":
-            he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
+            # load the hash_enricher module
+            he = get_module(HashEnricher, self.config)
            hd = he.calculate_hash(media.filename)
            filename = hd[:24]
        else:
@@ -2,7 +2,7 @@ from loguru import logger
 import csv

 from . import Feeder
-from ..core import Metadata, ArchivingContext
+from ..core import Metadata
 from ..utils import url_or_none

 class CSVFeeder(Feeder):
@@ -34,5 +34,4 @@ class CSVFeeder(Feeder):
        for row in reader:
            url = row[0]
            logger.debug(f"Processing {url}")
            yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
@@ -2,7 +2,7 @@ from loguru import logger
 import csv

 from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata
 from auto_archiver.utils import url_or_none

 class CSVFeeder(Feeder):
@@ -19,5 +19,4 @@ class CSVFeeder(Feeder):
        for row in reader:
            url = row[0]
            logger.debug(f"Processing {url}")
            yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
@@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor
 from loguru import logger

 from auto_archiver.core.extractor import Extractor
-from ...core import Metadata, Media, ArchivingContext
+from ...core import Metadata, Media

 class GenericExtractor(Extractor):
     _dropins = {}
@@ -6,7 +6,7 @@ from urllib.parse import quote
 from loguru import logger

 from auto_archiver.core import Database
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.modules.gsheet_feeder import GWorksheet


@@ -93,8 +93,7 @@ class GsheetsDb(Database):
            logger.debug(f"Unable to update sheet: {e}")

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
-        if gsheet := ArchivingContext.get("gsheet"):
+        if gsheet := item.get_context("gsheet"):
            gw: GWorksheet = gsheet.get("worksheet")
            row: int = gsheet.get("row")
        elif self.sheet_id:
@@ -15,7 +15,7 @@ from loguru import logger
 from slugify import slugify

 from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata
 from . import GWorksheet


@@ -60,17 +60,15 @@ class GsheetsFeeder(Feeder):

                # All checks done - archival process starts here
                m = Metadata().set_url(url)
-                ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
                if gw.get_cell_or_default(row, 'folder', "") is None:
                    folder = ''
                else:
                    folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
-                if len(folder):
-                    if self.use_sheet_names_in_stored_paths:
-                        ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
-                    else:
-                        ArchivingContext.set("folder", folder, True)
+                if len(folder) and self.use_sheet_names_in_stored_paths:
+                    folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title))
+
+                m.set_context('folder', folder)
+                m.set_context('worksheet', {"row": row, "worksheet": gw})
                yield m

            logger.success(f'Finished worksheet {wks.title}')
@@ -11,7 +11,7 @@ import hashlib
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata


 class HashEnricher(Enricher):
@@ -16,7 +16,7 @@ from loguru import logger
 from telethon.sync import TelegramClient

 from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.utils import random_str


@@ -61,7 +61,7 @@ class InstagramTbotExtractor(Extractor):
        if not "instagram.com" in url: return False

        result = Metadata()
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
        with self.client.start():
            chat = self.client.get_entity("instagram_load_bot")
            since_id = self.client.send_message(entity=chat, message=url).id
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext, Media
+from auto_archiver.core import Metadata, Media


 class SSLEnricher(Enricher):
@@ -3,7 +3,7 @@ import requests, time
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.modules.s3_storage import S3Storage
 from auto_archiver.core.module import get_module

@@ -25,7 +25,7 @@ class WhisperEnricher(Enricher):
        job_results = {}
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
-                m.store(url=url, metadata=to_enrich)
+                m.store(url=url, metadata=to_enrich, storages=self.storages)
                try:
                    job_id = self.submit_job(m)
                    job_results[job_id] = False
@@ -110,7 +110,7 @@ class WhisperEnricher(Enricher):

    def _get_s3_storage(self) -> S3Storage:
        try:
-            return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
+            return next(s for s in self.storages if s.__class__ == S3Storage)
        except:
            logger.warning("No S3Storage instance found in storages")
            return
@@ -1,5 +0,0 @@
-import tempfile
-
-from auto_archiver.core.context import ArchivingContext
-
-ArchivingContext.reset(full_reset=True)