Remove ArchivingContext completely

Context for a specific URL/item is now passed around via the metadata object (metadata.set_context('key', 'val') and metadata.get_context('key', default='something')).
The only other thing ArchivingContext carried was the storage info, which is now accessible via self.config.
pull/189/head
Patrick Robertson 2025-01-30 17:50:54 +01:00
parent d76063c3f3
commit c25d5cae84
19 changed files with 59 additions and 122 deletions
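
For module authors, the change looks roughly like this (a minimal sketch using only names visible in this diff; the URL is a placeholder and the snippet assumes the package is installed at this revision):

    from auto_archiver.core import Metadata

    # per-item context now lives on the Metadata object instead of a
    # process-wide ArchivingContext singleton
    item = Metadata().set_url("https://example.com/post/123")
    item.set_context("folder", "cli")
    folder = item.get_context("folder", default="")

    # inside a module, storages come from the shared config rather than
    # from ArchivingContext.get("storages"):
    #   self.config.get("storages", [])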

View file

@@ -4,7 +4,6 @@
from .metadata import Metadata
from .media import Media
from .module import BaseModule
from .context import ArchivingContext
# cannot import ArchivingOrchestrator/Config to avoid circular dep
# from .orchestrator import ArchivingOrchestrator

View file

@@ -56,6 +56,10 @@ class BaseModule(ABC):
# this is set by the orchestrator prior to archiving
tmp_dir: TemporaryDirectory = None
@property
def storages(self) -> list:
return self.config.get('storages', [])
def setup(self, config: dict):
authentication = config.get('authentication', {})
@@ -75,9 +79,6 @@ class BaseModule(ABC):
self.config = config
for key, val in config.get(self.name, {}).items():
setattr(self, key, val)
def repr(self):
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
def auth_for_site(self, site: str) -> dict:
# TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
@@ -97,4 +98,7 @@ class BaseModule(ABC):
did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it exactly matches.")
return {}
return {}
def repr(self):
return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"

View file

@@ -1,56 +0,0 @@
""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
This singleton class allows for:
- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
- Marking certain values to persist across resets using `keep_on_reset`.
- Managing temporary directories and other shared data used during the archiving process.
### Key Features:
- Creates a single global instance.
- Reset functionality allows for clearing configurations, with options for partial or full resets.
- Custom getters and setters for commonly used context values like temporary directories.
"""
class ArchivingContext:
"""
Singleton context class for managing global configurations and temporary data.
ArchivingContext._get_instance() to retrieve it if needed
otherwise just
ArchivingContext.set(key, value)
and
ArchivingContext.get(key, default)
When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
reset(full_reset=True) will recreate everything including the keep_on_reset status
"""
_instance = None
def __init__(self):
self.configs = {}
self.keep_on_reset = set()
@staticmethod
def get_instance():
if ArchivingContext._instance is None:
ArchivingContext._instance = ArchivingContext()
return ArchivingContext._instance
@staticmethod
def set(key, value, keep_on_reset: bool = False):
ac = ArchivingContext.get_instance()
ac.configs[key] = value
if keep_on_reset: ac.keep_on_reset.add(key)
@staticmethod
def get(key: str, default=None):
return ArchivingContext.get_instance().configs.get(key, default)
@staticmethod
def reset(full_reset: bool = False):
ac = ArchivingContext.get_instance()
if full_reset: ac.keep_on_reset = set()
ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
# ---- custom getters/setters for widely used context values
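
Because context is now scoped to each Metadata item, the reset()/keep_on_reset machinery described above needs no replacement: every new item starts with an empty context (a small illustrative sketch, not code from this commit):

    from auto_archiver.core import Metadata

    first = Metadata().set_url("https://example.com/a")
    first.set_context("folder", "team-a")

    second = Metadata().set_url("https://example.com/b")
    assert first.get_context("folder") == "team-a"
    assert second.get_context("folder") is None   # nothing leaks between items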

View file

@@ -17,7 +17,7 @@ from loguru import logger
from retrying import retry
import re
from ..core import Metadata, ArchivingContext, BaseModule
from ..core import Metadata, BaseModule
class Extractor(BaseModule):

View file

@@ -11,8 +11,6 @@ from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
from .context import ArchivingContext
from loguru import logger
@@ -36,12 +34,11 @@ class Media:
_mimetype: str = None # eg: image/jpeg
_stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude
def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None:
# 'Any' typing for metadata to avoid circular imports. Stores the media
# into the provided/available storages [Storage] repeats the process for
# its properties, in case they have inner media themselves for now it
# only goes down 1 level but it's easy to make it recursive if needed.
storages = override_storages or ArchivingContext.get("storages")
if not len(storages):
logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
return
@@ -66,8 +63,9 @@ class Media:
for inner_media in prop_media.all_inner_media(include_self=True):
yield inner_media
def is_stored(self) -> bool:
return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
def is_stored(self, in_storage) -> bool:
# checks if the media is already stored in the given storage
return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url() in u])
def set(self, key: str, value: Any) -> Media:
self.properties[key] = value
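
Callers now pass storages explicitly; a hedged usage sketch of the new signatures (store_media is a hypothetical helper, and inside a module the storages would come from self.storages):

    from typing import List
    from loguru import logger
    from auto_archiver.core import Media, Metadata

    def store_media(media: Media, item: Metadata, storages: List) -> None:
        # storages are passed in explicitly; the old fallback to
        # ArchivingContext.get("storages") inside Media.store is gone
        media.store(metadata=item, url=item.get_url(), storages=storages)

        # is_stored is asked per storage instead of comparing the number of
        # media URLs against a global storages list
        for storage in storages:
            if media.is_stored(in_storage=storage):
                logger.debug(f"{media.filename} is stored in {storage}")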

View file

@@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt
from loguru import logger
from .media import Media
from .context import ArchivingContext
@dataclass_json # annotation order matters
@dataclass
@@ -32,6 +30,7 @@ class Metadata:
def __post_init__(self):
self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
self._context = {}
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
@@ -57,12 +56,11 @@ class Metadata:
return right.merge(self)
return self
def store(self: Metadata, override_storages: List = None):
def store(self, storages=[]):
# calls .store for all contained media. storages [Storage]
self.remove_duplicate_media_by_hash()
storages = override_storages or ArchivingContext.get("storages")
for media in self.media:
media.store(override_storages=storages, url=self.get_url(), metadata=self)
media.store(url=self.get_url(), metadata=self, storages=storages)
def set(self, key: str, val: Any) -> Metadata:
self.metadata[key] = val
@@ -206,3 +204,10 @@ class Metadata:
if len(r.media) > len(most_complete.media): most_complete = r
elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
return most_complete
def set_context(self, key: str, val: Any) -> Metadata:
self._context[key] = val
return self
def get_context(self, key: str, default: Any = None) -> Any:
return self._context.get(key, default)
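
Downstream steps read per-item context straight off the Metadata and hand storages to store() explicitly (a hedged sketch; run_step is hypothetical and the storages list would come from the orchestrator's config):

    from typing import List
    from auto_archiver.core import Metadata

    def run_step(result: Metadata, storages: List) -> None:
        # context written by an earlier step (e.g. a feeder) travels on the item
        folder = result.get_context("folder", default="")
        print(f"storing media for {result.get_url()} under folder {folder!r}")

        # replaces result.store() plus the old ArchivingContext.get("storages") fallback
        result.store(storages=storages)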

View file

@@ -43,7 +43,6 @@ def setup_paths(paths: list[str]) -> None:
# sort based on the length of the path, so that the longest path is last in the list
auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)
def get_module(module_name: str, config: dict) -> BaseModule:
"""
Gets and sets up a module using the provided config
@@ -69,6 +68,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
return module
def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:
# search through all valid 'modules' paths. Default is 'modules' in the current directory
# see odoo/modules/module.py -> get_modules

View file

@@ -17,9 +17,8 @@ import traceback
from rich_argparse import RichHelpFormatter
from .context import ArchivingContext
from .metadata import Metadata
from .metadata import Metadata, Media
from ..version import __version__
from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
from .module import available_modules, LazyBaseModule, get_module, setup_paths
@@ -268,7 +267,6 @@ class ArchivingOrchestrator:
for url in urls:
logger.debug(f"Processing URL: '{url}'")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
pseudo_module = type('CLIFeeder', (Feeder,), {
'name': 'cli_feeder',
@@ -297,9 +295,6 @@ class ArchivingOrchestrator:
continue
if loaded_module:
step_items.append(loaded_module)
# TODO temp solution
if module_type == "storage":
ArchivingContext.set("storages", step_items, keep_on_reset=True)
check_steps_ok()
self.config['steps'][f"{module_type}s"] = step_items
@@ -449,11 +444,12 @@ class ArchivingOrchestrator:
logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")
# 5 - store all downloaded/generated media
result.store()
result.store(storages=self.storages)
# 6 - format and store formatted if needed
final_media: Media
if final_media := self.formatters[0].format(result):
final_media.store(url=url, metadata=result)
final_media.store(url=url, metadata=result, storages=self.storages)
result.set_final_media(final_media)
if result.is_empty():

View file

@@ -8,16 +8,16 @@ from slugify import slugify
from auto_archiver.utils.misc import random_str
from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata
from auto_archiver.core import Media, BaseModule, Metadata
from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
from auto_archiver.core.module import get_module
class Storage(BaseModule):
def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
if media.is_stored():
def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
if media.is_stored(in_storage=self):
logger.debug(f"{media.key} already stored, skipping")
return
self.set_key(media, url)
self.set_key(media, url, metadata)
self.upload(media, metadata=metadata)
media.add_url(self.get_cdn_url(media))
@@ -32,30 +32,31 @@ class Storage(BaseModule):
with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, url) -> None:
def set_key(self, media: Media, url, metadata: Metadata) -> None:
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = ArchivingContext.get("folder", "")
folder = metadata.folder
filename, ext = os.path.splitext(media.filename)
# Handle path_generator logic
path_generator = ArchivingContext.get("path_generator", "url")
path_generator = self.config.get("path_generator", "url")
if path_generator == "flat":
path = ""
filename = slugify(filename) # Ensure filename is slugified
elif path_generator == "url":
path = slugify(url)
elif path_generator == "random":
path = ArchivingContext.get("random_path", random_str(24), True)
path = self.config.get("random_path", random_str(24), True)
else:
raise ValueError(f"Invalid path_generator: {path_generator}")
# Handle filename_generator logic
filename_generator = ArchivingContext.get("filename_generator", "random")
filename_generator = self.config.get("filename_generator", "random")
if filename_generator == "random":
filename = random_str(24)
elif filename_generator == "static":
he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
# load the hash_enricher module
he = get_module(HashEnricher, self.config)
hd = he.calculate_hash(media.filename)
filename = hd[:24]
else:
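
The key layout itself is unchanged; for instance, with path_generator "url" and filename_generator "static", the key comes out roughly as folder/slug-of-url/first-24-hash-chars plus the original extension (a hedged reconstruction of what set_key implies, not a verbatim trace):

    import os
    from slugify import slugify

    folder = "my-sheet"                             # from metadata.folder (set by the feeder)
    path = slugify("https://example.com/post/123")  # path_generator == "url"
    filename = "9b74c9897bac770ffc029102"           # filename_generator == "static": first 24 hash chars
    ext = ".mp4"

    print(os.path.join(folder, path, filename + ext))
    # my-sheet/https-example-com-post-123/9b74c9897bac770ffc029102.mp4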

View file

@@ -2,7 +2,7 @@ from loguru import logger
import csv
from . import Feeder
from ..core import Metadata, ArchivingContext
from ..core import Metadata
from ..utils import url_or_none
class CSVFeeder(Feeder):
@@ -34,5 +34,4 @@ class CSVFeeder(Feeder):
for row in reader:
url = row[0]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
yield Metadata().set_url(url)

View file

@@ -2,7 +2,7 @@ from loguru import logger
import csv
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from auto_archiver.utils import url_or_none
class CSVFeeder(Feeder):
@@ -19,5 +19,4 @@ class CSVFeeder(Feeder):
for row in reader:
url = row[0]
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
yield Metadata().set_url(url)

View file

@@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor
from loguru import logger
from auto_archiver.core.extractor import Extractor
from ...core import Metadata, Media, ArchivingContext
from ...core import Metadata, Media
class GenericExtractor(Extractor):
_dropins = {}

View file

@@ -6,7 +6,7 @@ from urllib.parse import quote
from loguru import logger
from auto_archiver.core import Database
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.gsheet_feeder import GWorksheet
@@ -93,8 +93,7 @@ class GsheetsDb(Database):
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
if gsheet := ArchivingContext.get("gsheet"):
if gsheet := item.get_context("gsheet"):
gw: GWorksheet = gsheet.get("worksheet")
row: int = gsheet.get("row")
elif self.sheet_id:

View file

@@ -15,7 +15,7 @@ from loguru import logger
from slugify import slugify
from auto_archiver.core import Feeder
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
from . import GWorksheet
@@ -60,17 +60,15 @@ class GsheetsFeeder(Feeder):
# All checks done - archival process starts here
m = Metadata().set_url(url)
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
if gw.get_cell_or_default(row, 'folder', "") is None:
folder = ''
else:
folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
if len(folder):
if self.use_sheet_names_in_stored_paths:
ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
else:
ArchivingContext.set("folder", folder, True)
if len(folder) and self.use_sheet_names_in_stored_paths:
folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title))
m.set_context('folder', folder)
m.set_context('worksheet', {"row": row, "worksheet": gw})
yield m
logger.success(f'Finished worksheet {wks.title}')

View file

@@ -11,7 +11,7 @@ import hashlib
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext
from auto_archiver.core import Metadata
class HashEnricher(Enricher):

View file

@@ -16,7 +16,7 @@ from loguru import logger
from telethon.sync import TelegramClient
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.utils import random_str
@@ -61,7 +61,7 @@ class InstagramTbotExtractor(Extractor):
if not "instagram.com" in url: return False
result = Metadata()
tmp_dir = ArchivingContext.get_tmp_dir()
tmp_dir = self.tmp_dir
with self.client.start():
chat = self.client.get_entity("instagram_load_bot")
since_id = self.client.send_message(entity=chat, message=url).id

View file

@@ -4,7 +4,7 @@ from urllib.parse import urlparse
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, ArchivingContext, Media
from auto_archiver.core import Metadata, Media
class SSLEnricher(Enricher):

View file

@@ -3,7 +3,7 @@ import requests, time
from loguru import logger
from auto_archiver.core import Enricher
from auto_archiver.core import Metadata, Media, ArchivingContext
from auto_archiver.core import Metadata, Media
from auto_archiver.modules.s3_storage import S3Storage
from auto_archiver.core.module import get_module
@@ -25,7 +25,7 @@ class WhisperEnricher(Enricher):
job_results = {}
for i, m in enumerate(to_enrich.media):
if m.is_video() or m.is_audio():
m.store(url=url, metadata=to_enrich)
m.store(url=url, metadata=to_enrich, storages=self.storages)
try:
job_id = self.submit_job(m)
job_results[job_id] = False
@@ -110,7 +110,7 @@ class WhisperEnricher(Enricher):
def _get_s3_storage(self) -> S3Storage:
try:
return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
return next(s for s in self.storages if s.__class__ == S3Storage)
except:
logger.warning("No S3Storage instance found in storages")
return

View file

@@ -1,5 +0,0 @@
import tempfile
from auto_archiver.core.context import ArchivingContext
ArchivingContext.reset(full_reset=True)