Mirror of https://github.com/bellingcat/auto-archiver

Remove ArchivingContext completely

Context for a specific url/item is now passed around via the metadata (metadata.set_context('key', 'val') and metadata.get_context('key', default='something')). The only other thing that was passed around in ArchivingContext was the storage info, which is already accessible via self.config.

parent d76063c3f3
commit c25d5cae84
branch pull/189/head
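As a quick illustration (a minimal sketch based only on the API visible in the diff below, not code taken from this commit; the example URL is made up), per-item context now lives on the Metadata object and storages come from a module's own config:

    from auto_archiver.core import Metadata

    m = Metadata().set_url("https://example.com/some-post")

    # per-url/item values are attached to the item itself...
    m.set_context("folder", "cli")

    # ...and read back later, with a default if the key was never set
    folder = m.get_context("folder", default="")

    # inside a BaseModule subclass, storages are no longer pulled from a global
    # context; the new `storages` property reads them from self.config, e.g.:
    #     result.store(storages=self.storages)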
@@ -4,7 +4,6 @@
 from .metadata import Metadata
 from .media import Media
 from .module import BaseModule
-from .context import ArchivingContext

 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator

@@ -56,6 +56,10 @@ class BaseModule(ABC):
     # this is set by the orchestrator prior to archiving
     tmp_dir: TemporaryDirectory = None

+    @property
+    def storages(self) -> list:
+        return self.config.get('storages', [])
+
     def setup(self, config: dict):

         authentication = config.get('authentication', {})

@@ -75,9 +79,6 @@ class BaseModule(ABC):
         self.config = config
         for key, val in config.get(self.name, {}).items():
             setattr(self, key, val)

-    def repr(self):
-        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
-
     def auth_for_site(self, site: str) -> dict:
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)

@@ -97,4 +98,7 @@ class BaseModule(ABC):
             did find information for '{key}' which is close, is this what you meant? \
             If so, edit your authentication settings to make sure it exactly matches.")

         return {}
+
+    def repr(self):
+        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
@@ -1,56 +0,0 @@
-""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
-
-This singleton class allows for:
-- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
-- Marking certain values to persist across resets using `keep_on_reset`.
-- Managing temporary directories and other shared data used during the archiving process.
-
-### Key Features:
-- Creates a single global instance.
-- Reset functionality allows for clearing configurations, with options for partial or full resets.
-- Custom getters and setters for commonly used context values like temporary directories.
-
-"""
-
-class ArchivingContext:
-    """
-    Singleton context class for managing global configurations and temporary data.
-
-    ArchivingContext._get_instance() to retrieve it if needed
-    otherwise just
-    ArchivingContext.set(key, value)
-    and
-    ArchivingContext.get(key, default)
-
-    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
-    reset(full_reset=True) will recreate everything including the keep_on_reset status
-    """
-    _instance = None
-
-    def __init__(self):
-        self.configs = {}
-        self.keep_on_reset = set()
-
-    @staticmethod
-    def get_instance():
-        if ArchivingContext._instance is None:
-            ArchivingContext._instance = ArchivingContext()
-        return ArchivingContext._instance
-
-    @staticmethod
-    def set(key, value, keep_on_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        ac.configs[key] = value
-        if keep_on_reset: ac.keep_on_reset.add(key)
-
-    @staticmethod
-    def get(key: str, default=None):
-        return ArchivingContext.get_instance().configs.get(key, default)
-
-    @staticmethod
-    def reset(full_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        if full_reset: ac.keep_on_reset = set()
-        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
-
-    # ---- custom getters/setters for widely used context values
@@ -17,7 +17,7 @@ from loguru import logger
 from retrying import retry
 import re

-from ..core import Metadata, ArchivingContext, BaseModule
+from ..core import Metadata, BaseModule


 class Extractor(BaseModule):
@@ -11,8 +11,6 @@ from dataclasses import dataclass, field
 from dataclasses_json import dataclass_json, config
 import mimetypes

-from .context import ArchivingContext
-
 from loguru import logger


@@ -36,12 +34,11 @@ class Media:
     _mimetype: str = None # eg: image/jpeg
     _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude

-    def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
+    def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None:
         # 'Any' typing for metadata to avoid circular imports. Stores the media
         # into the provided/available storages [Storage] repeats the process for
         # its properties, in case they have inner media themselves for now it
         # only goes down 1 level but it's easy to make it recursive if needed.
-        storages = override_storages or ArchivingContext.get("storages")
         if not len(storages):
             logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
             return

@@ -66,8 +63,9 @@ class Media:
             for inner_media in prop_media.all_inner_media(include_self=True):
                 yield inner_media

-    def is_stored(self) -> bool:
-        return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
+    def is_stored(self, in_storage) -> bool:
+        # checks if the media is already stored in the given storage
+        return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url() in u])

     def set(self, key: str, value: Any) -> Media:
         self.properties[key] = value
@@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt
 from loguru import logger

 from .media import Media
-from .context import ArchivingContext
-

 @dataclass_json # annotation order matters
 @dataclass

@@ -32,6 +30,7 @@ class Metadata:

     def __post_init__(self):
         self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
+        self._context = {}

     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """

@@ -57,12 +56,11 @@ class Metadata:
             return right.merge(self)
         return self

-    def store(self: Metadata, override_storages: List = None):
+    def store(self, storages=[]):
         # calls .store for all contained media. storages [Storage]
         self.remove_duplicate_media_by_hash()
-        storages = override_storages or ArchivingContext.get("storages")
         for media in self.media:
-            media.store(override_storages=storages, url=self.get_url(), metadata=self)
+            media.store(url=self.get_url(), metadata=self, storages=storages)

     def set(self, key: str, val: Any) -> Metadata:
         self.metadata[key] = val

@@ -206,3 +204,10 @@ class Metadata:
             if len(r.media) > len(most_complete.media): most_complete = r
             elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
         return most_complete
+
+    def set_context(self, key: str, val: Any) -> Metadata:
+        self._context[key] = val
+        return self
+
+    def get_context(self, key: str, default: Any = None) -> Any:
+        return self._context.get(key, default)
@@ -43,7 +43,6 @@ def setup_paths(paths: list[str]) -> None:
     # sort based on the length of the path, so that the longest path is last in the list
     auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)

-
 def get_module(module_name: str, config: dict) -> BaseModule:
     """
     Gets and sets up a module using the provided config

@@ -69,6 +68,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
     return module

+
 def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:

     # search through all valid 'modules' paths. Default is 'modules' in the current directory
     # see odoo/modules/module.py -> get_modules
@@ -17,9 +17,8 @@ import traceback

 from rich_argparse import RichHelpFormatter

-from .context import ArchivingContext

-from .metadata import Metadata
+from .metadata import Metadata, Media
 from ..version import __version__
 from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths

@@ -268,7 +267,6 @@ class ArchivingOrchestrator:
         for url in urls:
             logger.debug(f"Processing URL: '{url}'")
             yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")

         pseudo_module = type('CLIFeeder', (Feeder,), {
             'name': 'cli_feeder',

@@ -297,9 +295,6 @@ class ArchivingOrchestrator:
                 continue
             if loaded_module:
                 step_items.append(loaded_module)
-                # TODO temp solution
-                if module_type == "storage":
-                    ArchivingContext.set("storages", step_items, keep_on_reset=True)

         check_steps_ok()
         self.config['steps'][f"{module_type}s"] = step_items

@@ -449,11 +444,12 @@ class ArchivingOrchestrator:
                 logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")

         # 5 - store all downloaded/generated media
-        result.store()
+        result.store(storages=self.storages)

         # 6 - format and store formatted if needed
+        final_media: Media
         if final_media := self.formatters[0].format(result):
-            final_media.store(url=url, metadata=result)
+            final_media.store(url=url, metadata=result, storages=self.storages)
             result.set_final_media(final_media)

         if result.is_empty():
@@ -8,16 +8,16 @@ from slugify import slugify

 from auto_archiver.utils.misc import random_str

-from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata
+from auto_archiver.core import Media, BaseModule, Metadata
 from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
-
+from auto_archiver.core.module import get_module

 class Storage(BaseModule):

-    def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
-        if media.is_stored():
+    def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
+        if media.is_stored(in_storage=self):
             logger.debug(f"{media.key} already stored, skipping")
             return
-        self.set_key(media, url)
+        self.set_key(media, url, metadata)
         self.upload(media, metadata=metadata)
         media.add_url(self.get_cdn_url(media))

@@ -32,30 +32,31 @@ class Storage(BaseModule):
         with open(media.filename, 'rb') as f:
             return self.uploadf(f, media, **kwargs)

-    def set_key(self, media: Media, url) -> None:
+    def set_key(self, media: Media, url, metadata: Metadata) -> None:
         """takes the media and optionally item info and generates a key"""
         if media.key is not None and len(media.key) > 0: return
-        folder = ArchivingContext.get("folder", "")
+        folder = metadata.folder
         filename, ext = os.path.splitext(media.filename)

         # Handle path_generator logic
-        path_generator = ArchivingContext.get("path_generator", "url")
+        path_generator = self.config.get("path_generator", "url")
         if path_generator == "flat":
             path = ""
             filename = slugify(filename) # Ensure filename is slugified
         elif path_generator == "url":
             path = slugify(url)
         elif path_generator == "random":
-            path = ArchivingContext.get("random_path", random_str(24), True)
+            path = self.config.get("random_path", random_str(24), True)
         else:
             raise ValueError(f"Invalid path_generator: {path_generator}")

         # Handle filename_generator logic
-        filename_generator = ArchivingContext.get("filename_generator", "random")
+        filename_generator = self.config.get("filename_generator", "random")
         if filename_generator == "random":
             filename = random_str(24)
         elif filename_generator == "static":
-            he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
+            # load the hash_enricher module
+            he = get_module(HashEnricher, self.config)
             hd = he.calculate_hash(media.filename)
             filename = hd[:24]
         else:
@@ -2,7 +2,7 @@ from loguru import logger
 import csv

 from . import Feeder
-from ..core import Metadata, ArchivingContext
+from ..core import Metadata
 from ..utils import url_or_none

 class CSVFeeder(Feeder):

@@ -34,5 +34,4 @@ class CSVFeeder(Feeder):
         for row in reader:
             url = row[0]
             logger.debug(f"Processing {url}")
             yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
@@ -2,7 +2,7 @@ from loguru import logger
 import csv

 from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata
 from auto_archiver.utils import url_or_none

 class CSVFeeder(Feeder):

@@ -19,5 +19,4 @@ class CSVFeeder(Feeder):
         for row in reader:
             url = row[0]
             logger.debug(f"Processing {url}")
             yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
@@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor
 from loguru import logger

 from auto_archiver.core.extractor import Extractor
-from ...core import Metadata, Media, ArchivingContext
+from ...core import Metadata, Media

 class GenericExtractor(Extractor):
     _dropins = {}
@@ -6,7 +6,7 @@ from urllib.parse import quote
 from loguru import logger

 from auto_archiver.core import Database
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.modules.gsheet_feeder import GWorksheet


@@ -93,8 +93,7 @@ class GsheetsDb(Database):
             logger.debug(f"Unable to update sheet: {e}")

     def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
-        if gsheet := ArchivingContext.get("gsheet"):
+        if gsheet := item.get_context("gsheet"):
             gw: GWorksheet = gsheet.get("worksheet")
             row: int = gsheet.get("row")
         elif self.sheet_id:
@@ -15,7 +15,7 @@ from loguru import logger
 from slugify import slugify

 from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata
 from . import GWorksheet


@@ -60,17 +60,15 @@ class GsheetsFeeder(Feeder):

                 # All checks done - archival process starts here
                 m = Metadata().set_url(url)
-                ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
                 if gw.get_cell_or_default(row, 'folder', "") is None:
                     folder = ''
                 else:
                     folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
-                if len(folder):
-                    if self.use_sheet_names_in_stored_paths:
-                        ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
-                    else:
-                        ArchivingContext.set("folder", folder, True)
+                if len(folder) and self.use_sheet_names_in_stored_paths:
+                    folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title))

+                m.set_context('folder', folder)
+                m.set_context('worksheet', {"row": row, "worksheet": gw})
                 yield m

             logger.success(f'Finished worksheet {wks.title}')
@@ -11,7 +11,7 @@ import hashlib
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata


 class HashEnricher(Enricher):
@@ -16,7 +16,7 @@ from loguru import logger
 from telethon.sync import TelegramClient

 from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.utils import random_str


@@ -61,7 +61,7 @@ class InstagramTbotExtractor(Extractor):
         if not "instagram.com" in url: return False

         result = Metadata()
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
         with self.client.start():
             chat = self.client.get_entity("instagram_load_bot")
             since_id = self.client.send_message(entity=chat, message=url).id
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext, Media
+from auto_archiver.core import Metadata, Media


 class SSLEnricher(Enricher):
@@ -3,7 +3,7 @@ import requests, time
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.modules.s3_storage import S3Storage
 from auto_archiver.core.module import get_module


@@ -25,7 +25,7 @@ class WhisperEnricher(Enricher):
         job_results = {}
         for i, m in enumerate(to_enrich.media):
             if m.is_video() or m.is_audio():
-                m.store(url=url, metadata=to_enrich)
+                m.store(url=url, metadata=to_enrich, storages=self.storages)
                 try:
                     job_id = self.submit_job(m)
                     job_results[job_id] = False

@@ -110,7 +110,7 @@ class WhisperEnricher(Enricher):

     def _get_s3_storage(self) -> S3Storage:
         try:
-            return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
+            return next(s for s in self.storages if s.__class__ == S3Storage)
         except:
             logger.warning("No S3Storage instance found in storages")
             return
@@ -1,5 +0,0 @@
-import tempfile
-
-from auto_archiver.core.context import ArchivingContext
-
-ArchivingContext.reset(full_reset=True)