Mirror of https://github.com/bellingcat/auto-archiver
Remove ArchivingContext completely
Context for a specific URL/item is now passed around via the metadata (metadata.set_context('key', 'val') and metadata.get_context('key', default='something')). The only other thing that was passed around in ArchivingContext was the storage info, which is already accessible via self.config.
pull/189/head
parent
d76063c3f3
commit
c25d5cae84
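For orientation, a minimal usage sketch of the replacement API (not part of this commit's diff): the URL and key names below are illustrative, while the set_context/get_context calls and the storages property are taken from the changes that follow.

from auto_archiver.core import Metadata

# Per-item context now lives on the Metadata object instead of a global singleton.
item = Metadata().set_url("https://example.com/some-post")   # illustrative URL
item.set_context("folder", "cli")                             # write a per-item value
folder = item.get_context("folder", default="unnamed")        # read it back, with a default

# Storage info is no longer read from ArchivingContext either: inside any BaseModule
# subclass, self.storages simply returns self.config.get('storages', []).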
@@ -4,7 +4,6 @@
 from .metadata import Metadata
 from .media import Media
 from .module import BaseModule
-from .context import ArchivingContext

 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator
@@ -56,6 +56,10 @@ class BaseModule(ABC):
     # this is set by the orchestrator prior to archiving
     tmp_dir: TemporaryDirectory = None

+    @property
+    def storages(self) -> list:
+        return self.config.get('storages', [])
+
     def setup(self, config: dict):

         authentication = config.get('authentication', {})
@@ -75,9 +79,6 @@ class BaseModule(ABC):
         self.config = config
         for key, val in config.get(self.name, {}).items():
             setattr(self, key, val)

-    def repr(self):
-        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
-
     def auth_for_site(self, site: str) -> dict:
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
@@ -97,4 +98,7 @@ class BaseModule(ABC):
did find information for '{key}' which is close, is this what you meant? \
If so, edit your authentication settings to make sure it exactly matches.")

                return {}
         return {}
+
+    def repr(self):
+        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
@@ -1,56 +0,0 @@
-""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
-
-This singleton class allows for:
-- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
-- Marking certain values to persist across resets using `keep_on_reset`.
-- Managing temporary directories and other shared data used during the archiving process.
-
-### Key Features:
-- Creates a single global instance.
-- Reset functionality allows for clearing configurations, with options for partial or full resets.
-- Custom getters and setters for commonly used context values like temporary directories.
-
-"""
-
-class ArchivingContext:
-    """
-    Singleton context class for managing global configurations and temporary data.
-
-    ArchivingContext._get_instance() to retrieve it if needed
-    otherwise just
-    ArchivingContext.set(key, value)
-    and
-    ArchivingContext.get(key, default)
-
-    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
-    reset(full_reset=True) will recreate everything including the keep_on_reset status
-    """
-    _instance = None
-
-    def __init__(self):
-        self.configs = {}
-        self.keep_on_reset = set()
-
-    @staticmethod
-    def get_instance():
-        if ArchivingContext._instance is None:
-            ArchivingContext._instance = ArchivingContext()
-        return ArchivingContext._instance
-
-    @staticmethod
-    def set(key, value, keep_on_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        ac.configs[key] = value
-        if keep_on_reset: ac.keep_on_reset.add(key)
-
-    @staticmethod
-    def get(key: str, default=None):
-        return ArchivingContext.get_instance().configs.get(key, default)
-
-    @staticmethod
-    def reset(full_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        if full_reset: ac.keep_on_reset = set()
-        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
-
-    # ---- custom getters/setters for widely used context values
@@ -17,7 +17,7 @@ from loguru import logger
 from retrying import retry
 import re

-from ..core import Metadata, ArchivingContext, BaseModule
+from ..core import Metadata, BaseModule


 class Extractor(BaseModule):
@@ -11,8 +11,6 @@ from dataclasses import dataclass, field
 from dataclasses_json import dataclass_json, config
 import mimetypes

-from .context import ArchivingContext
-
 from loguru import logger


@@ -36,12 +34,11 @@ class Media:
     _mimetype: str = None  # eg: image/jpeg
     _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True))  # always exclude

-    def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
+    def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None:
         # 'Any' typing for metadata to avoid circular imports. Stores the media
         # into the provided/available storages [Storage] repeats the process for
         # its properties, in case they have inner media themselves for now it
         # only goes down 1 level but it's easy to make it recursive if needed.
-        storages = override_storages or ArchivingContext.get("storages")
         if not len(storages):
             logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
             return
@@ -66,8 +63,9 @@ class Media:
         for inner_media in prop_media.all_inner_media(include_self=True):
             yield inner_media

-    def is_stored(self) -> bool:
-        return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
+    def is_stored(self, in_storage) -> bool:
+        # checks if the media is already stored in the given storage
+        return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url() in u])

     def set(self, key: str, value: Any) -> Media:
         self.properties[key] = value
@@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt
 from loguru import logger

 from .media import Media
-from .context import ArchivingContext
-

 @dataclass_json  # annotation order matters
 @dataclass
@@ -32,6 +30,7 @@ class Metadata:

     def __post_init__(self):
         self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
+        self._context = {}

     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """
@@ -57,12 +56,11 @@ class Metadata:
             return right.merge(self)
         return self

-    def store(self: Metadata, override_storages: List = None):
+    def store(self, storages=[]):
         # calls .store for all contained media. storages [Storage]
         self.remove_duplicate_media_by_hash()
-        storages = override_storages or ArchivingContext.get("storages")
         for media in self.media:
-            media.store(override_storages=storages, url=self.get_url(), metadata=self)
+            media.store(url=self.get_url(), metadata=self, storages=storages)

     def set(self, key: str, val: Any) -> Metadata:
         self.metadata[key] = val
@@ -206,3 +204,10 @@ class Metadata:
             if len(r.media) > len(most_complete.media): most_complete = r
             elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
         return most_complete
+
+    def set_context(self, key: str, val: Any) -> Metadata:
+        self._context[key] = val
+        return self
+
+    def get_context(self, key: str, default: Any = None) -> Any:
+        return self._context.get(key, default)
@@ -43,7 +43,6 @@ def setup_paths(paths: list[str]) -> None:
     # sort based on the length of the path, so that the longest path is last in the list
     auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)

-
 def get_module(module_name: str, config: dict) -> BaseModule:
     """
     Gets and sets up a module using the provided config
@@ -69,6 +68,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBa
     return module


 def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:

     # search through all valid 'modules' paths. Default is 'modules' in the current directory
     # see odoo/modules/module.py -> get_modules
@@ -17,9 +17,8 @@ import traceback

 from rich_argparse import RichHelpFormatter

-from .context import ArchivingContext

-from .metadata import Metadata
+from .metadata import Metadata, Media
 from ..version import __version__
 from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths
@@ -268,7 +267,6 @@ class ArchivingOrchestrator:
            for url in urls:
                logger.debug(f"Processing URL: '{url}'")
                yield Metadata().set_url(url)
-                ArchivingContext.set("folder", "cli")

        pseudo_module = type('CLIFeeder', (Feeder,), {
            'name': 'cli_feeder',
@@ -297,9 +295,6 @@ class ArchivingOrchestrator:
                    continue
                if loaded_module:
                    step_items.append(loaded_module)
-                # TODO temp solution
-                if module_type == "storage":
-                    ArchivingContext.set("storages", step_items, keep_on_reset=True)

        check_steps_ok()
        self.config['steps'][f"{module_type}s"] = step_items
@@ -449,11 +444,12 @@ class ArchivingOrchestrator:
                logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")

        # 5 - store all downloaded/generated media
-        result.store()
+        result.store(storages=self.storages)

        # 6 - format and store formatted if needed
+        final_media: Media
        if final_media := self.formatters[0].format(result):
-            final_media.store(url=url, metadata=result)
+            final_media.store(url=url, metadata=result, storages=self.storages)
            result.set_final_media(final_media)

        if result.is_empty():
@@ -8,16 +8,16 @@ from slugify import slugify

 from auto_archiver.utils.misc import random_str

-from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata
+from auto_archiver.core import Media, BaseModule, Metadata
 from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher

+from auto_archiver.core.module import get_module
 class Storage(BaseModule):

-    def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
-        if media.is_stored():
+    def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
+        if media.is_stored(in_storage=self):
            logger.debug(f"{media.key} already stored, skipping")
            return
-        self.set_key(media, url)
+        self.set_key(media, url, metadata)
        self.upload(media, metadata=metadata)
        media.add_url(self.get_cdn_url(media))

@@ -32,30 +32,31 @@ class Storage(BaseModule):
        with open(media.filename, 'rb') as f:
            return self.uploadf(f, media, **kwargs)

-    def set_key(self, media: Media, url) -> None:
+    def set_key(self, media: Media, url, metadata: Metadata) -> None:
        """takes the media and optionally item info and generates a key"""
        if media.key is not None and len(media.key) > 0: return
-        folder = ArchivingContext.get("folder", "")
+        folder = metadata.folder
        filename, ext = os.path.splitext(media.filename)

        # Handle path_generator logic
-        path_generator = ArchivingContext.get("path_generator", "url")
+        path_generator = self.config.get("path_generator", "url")
        if path_generator == "flat":
            path = ""
            filename = slugify(filename)  # Ensure filename is slugified
        elif path_generator == "url":
            path = slugify(url)
        elif path_generator == "random":
-            path = ArchivingContext.get("random_path", random_str(24), True)
+            path = self.config.get("random_path", random_str(24), True)
        else:
            raise ValueError(f"Invalid path_generator: {path_generator}")

        # Handle filename_generator logic
-        filename_generator = ArchivingContext.get("filename_generator", "random")
+        filename_generator = self.config.get("filename_generator", "random")
        if filename_generator == "random":
            filename = random_str(24)
        elif filename_generator == "static":
-            he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
+            # load the hash_enricher module
+            he = get_module(HashEnricher, self.config)
            hd = he.calculate_hash(media.filename)
            filename = hd[:24]
        else:
@@ -2,7 +2,7 @@ from loguru import logger
 import csv

 from . import Feeder
-from ..core import Metadata, ArchivingContext
+from ..core import Metadata
 from ..utils import url_or_none

 class CSVFeeder(Feeder):
@@ -34,5 +34,4 @@ class CSVFeeder(Feeder):
        for row in reader:
            url = row[0]
            logger.debug(f"Processing {url}")
            yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
@@ -2,7 +2,7 @@ from loguru import logger
 import csv

 from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata
 from auto_archiver.utils import url_or_none

 class CSVFeeder(Feeder):
@@ -19,5 +19,4 @@ class CSVFeeder(Feeder):
        for row in reader:
            url = row[0]
            logger.debug(f"Processing {url}")
            yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
@@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor
 from loguru import logger

 from auto_archiver.core.extractor import Extractor
-from ...core import Metadata, Media, ArchivingContext
+from ...core import Metadata, Media

 class GenericExtractor(Extractor):
     _dropins = {}
@@ -6,7 +6,7 @@ from urllib.parse import quote
 from loguru import logger

 from auto_archiver.core import Database
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.modules.gsheet_feeder import GWorksheet


@@ -93,8 +93,7 @@ class GsheetsDb(Database):
            logger.debug(f"Unable to update sheet: {e}")

    def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
-        if gsheet := ArchivingContext.get("gsheet"):
+        if gsheet := item.get_context("gsheet"):
            gw: GWorksheet = gsheet.get("worksheet")
            row: int = gsheet.get("row")
        elif self.sheet_id:
@@ -15,7 +15,7 @@ from loguru import logger
 from slugify import slugify

 from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata
 from . import GWorksheet


@@ -60,17 +60,15 @@ class GsheetsFeeder(Feeder):

                # All checks done - archival process starts here
                m = Metadata().set_url(url)
-                ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
                if gw.get_cell_or_default(row, 'folder', "") is None:
                    folder = ''
                else:
                    folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
-                if len(folder):
-                    if self.use_sheet_names_in_stored_paths:
-                        ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
-                    else:
-                        ArchivingContext.set("folder", folder, True)
+                if len(folder) and self.use_sheet_names_in_stored_paths:
+                    folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title))
+
+                m.set_context('folder', folder)
+                m.set_context('worksheet', {"row": row, "worksheet": gw})
                yield m

            logger.success(f'Finished worksheet {wks.title}')
@@ -11,7 +11,7 @@ import hashlib
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata


 class HashEnricher(Enricher):
@@ -16,7 +16,7 @@ from loguru import logger
 from telethon.sync import TelegramClient

 from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.utils import random_str


@@ -61,7 +61,7 @@ class InstagramTbotExtractor(Extractor):
        if not "instagram.com" in url: return False

        result = Metadata()
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
        with self.client.start():
            chat = self.client.get_entity("instagram_load_bot")
            since_id = self.client.send_message(entity=chat, message=url).id
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext, Media
+from auto_archiver.core import Metadata, Media


 class SSLEnricher(Enricher):
@@ -3,7 +3,7 @@ import requests, time
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.modules.s3_storage import S3Storage
 from auto_archiver.core.module import get_module

@@ -25,7 +25,7 @@ class WhisperEnricher(Enricher):
        job_results = {}
        for i, m in enumerate(to_enrich.media):
            if m.is_video() or m.is_audio():
-                m.store(url=url, metadata=to_enrich)
+                m.store(url=url, metadata=to_enrich, storages=self.storages)
                try:
                    job_id = self.submit_job(m)
                    job_results[job_id] = False
@@ -110,7 +110,7 @@ class WhisperEnricher(Enricher):

    def _get_s3_storage(self) -> S3Storage:
        try:
-            return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
+            return next(s for s in self.storages if s.__class__ == S3Storage)
        except:
            logger.warning("No S3Storage instance found in storages")
            return
@@ -1,5 +0,0 @@
-import tempfile
-
-from auto_archiver.core.context import ArchivingContext
-
-ArchivingContext.reset(full_reset=True)