Mirror of https://github.com/bellingcat/auto-archiver

Remove ArchivingContext completely

Context for a specific url/item is now passed around via the metadata (metadata.set_context('key', 'val') and metadata.get_context('key', default='something')). The only other thing that was passed around in ArchivingContext was the storage info, which is already accessible via self.config.

parent d76063c3f3
commit c25d5cae84
branch pull/189/head
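As a quick illustration (a minimal sketch based only on the API visible in the diff below, not code taken from this commit; the example URL is made up), per-item context now lives on the Metadata object and storages come from a module's own config:

    from auto_archiver.core import Metadata

    m = Metadata().set_url("https://example.com/some-post")

    # per-url/item values are attached to the item itself...
    m.set_context("folder", "cli")

    # ...and read back later, with a default if the key was never set
    folder = m.get_context("folder", default="")

    # inside a BaseModule subclass, storages are no longer pulled from a global
    # context; the new `storages` property reads them from self.config, e.g.:
    #     result.store(storages=self.storages)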
@@ -4,7 +4,6 @@
 from .metadata import Metadata
 from .media import Media
 from .module import BaseModule
-from .context import ArchivingContext

 # cannot import ArchivingOrchestrator/Config to avoid circular dep
 # from .orchestrator import ArchivingOrchestrator

@@ -56,6 +56,10 @@ class BaseModule(ABC):
     # this is set by the orchestrator prior to archiving
     tmp_dir: TemporaryDirectory = None

+    @property
+    def storages(self) -> list:
+        return self.config.get('storages', [])
+
     def setup(self, config: dict):

         authentication = config.get('authentication', {})

@@ -75,9 +79,6 @@ class BaseModule(ABC):
         self.config = config
         for key, val in config.get(self.name, {}).items():
             setattr(self, key, val)

-    def repr(self):
-        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
-
     def auth_for_site(self, site: str) -> dict:
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)

@@ -97,4 +98,7 @@ class BaseModule(ABC):
             did find information for '{key}' which is close, is this what you meant? \
             If so, edit your authentication settings to make sure it exactly matches.")

         return {}
+
+    def repr(self):
+        return f"Module<'{self.display_name}' (config: {self.config[self.name]})>"
@@ -1,56 +0,0 @@
-""" ArchivingContext provides a global context for managing configurations and temporary data during the archiving process.
-
-This singleton class allows for:
-- Storing and retrieving key-value pairs that are accessible throughout the application lifecycle.
-- Marking certain values to persist across resets using `keep_on_reset`.
-- Managing temporary directories and other shared data used during the archiving process.
-
-### Key Features:
-- Creates a single global instance.
-- Reset functionality allows for clearing configurations, with options for partial or full resets.
-- Custom getters and setters for commonly used context values like temporary directories.
-
-"""
-
-class ArchivingContext:
-    """
-    Singleton context class for managing global configurations and temporary data.
-
-    ArchivingContext._get_instance() to retrieve it if needed
-    otherwise just
-    ArchivingContext.set(key, value)
-    and
-    ArchivingContext.get(key, default)
-
-    When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True)
-    reset(full_reset=True) will recreate everything including the keep_on_reset status
-    """
-    _instance = None
-
-    def __init__(self):
-        self.configs = {}
-        self.keep_on_reset = set()
-
-    @staticmethod
-    def get_instance():
-        if ArchivingContext._instance is None:
-            ArchivingContext._instance = ArchivingContext()
-        return ArchivingContext._instance
-
-    @staticmethod
-    def set(key, value, keep_on_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        ac.configs[key] = value
-        if keep_on_reset: ac.keep_on_reset.add(key)
-
-    @staticmethod
-    def get(key: str, default=None):
-        return ArchivingContext.get_instance().configs.get(key, default)
-
-    @staticmethod
-    def reset(full_reset: bool = False):
-        ac = ArchivingContext.get_instance()
-        if full_reset: ac.keep_on_reset = set()
-        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
-
-    # ---- custom getters/setters for widely used context values
@@ -17,7 +17,7 @@ from loguru import logger
 from retrying import retry
 import re

-from ..core import Metadata, ArchivingContext, BaseModule
+from ..core import Metadata, BaseModule


 class Extractor(BaseModule):
@@ -11,8 +11,6 @@ from dataclasses import dataclass, field
 from dataclasses_json import dataclass_json, config
 import mimetypes

-from .context import ArchivingContext
-
 from loguru import logger


@@ -36,12 +34,11 @@ class Media:
     _mimetype: str = None # eg: image/jpeg
     _stored: bool = field(default=False, repr=False, metadata=config(exclude=lambda _: True)) # always exclude

-    def store(self: Media, override_storages: List = None, url: str = "url-not-available", metadata: Any = None):
+    def store(self: Media, metadata: Any, url: str = "url-not-available", storages: List[Any] = None) -> None:
         # 'Any' typing for metadata to avoid circular imports. Stores the media
         # into the provided/available storages [Storage] repeats the process for
         # its properties, in case they have inner media themselves for now it
         # only goes down 1 level but it's easy to make it recursive if needed.
-        storages = override_storages or ArchivingContext.get("storages")
         if not len(storages):
             logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
             return

@@ -66,8 +63,9 @@ class Media:
             for inner_media in prop_media.all_inner_media(include_self=True):
                 yield inner_media

-    def is_stored(self) -> bool:
-        return len(self.urls) > 0 and len(self.urls) == len(ArchivingContext.get("storages"))
+    def is_stored(self, in_storage) -> bool:
+        # checks if the media is already stored in the given storage
+        return len(self.urls) > 0 and any([u for u in self.urls if in_storage.get_cdn_url() in u])

     def set(self, key: str, value: Any) -> Media:
         self.properties[key] = value
@@ -20,8 +20,6 @@ from dateutil.parser import parse as parse_dt
 from loguru import logger

 from .media import Media
-from .context import ArchivingContext
-

 @dataclass_json # annotation order matters
 @dataclass

@@ -32,6 +30,7 @@ class Metadata:

     def __post_init__(self):
         self.set("_processed_at", datetime.datetime.now(datetime.timezone.utc))
+        self._context = {}

     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """

@@ -57,12 +56,11 @@ class Metadata:
             return right.merge(self)
         return self

-    def store(self: Metadata, override_storages: List = None):
+    def store(self, storages=[]):
         # calls .store for all contained media. storages [Storage]
         self.remove_duplicate_media_by_hash()
-        storages = override_storages or ArchivingContext.get("storages")
         for media in self.media:
-            media.store(override_storages=storages, url=self.get_url(), metadata=self)
+            media.store(url=self.get_url(), metadata=self, storages=storages)

     def set(self, key: str, val: Any) -> Metadata:
         self.metadata[key] = val

@@ -206,3 +204,10 @@ class Metadata:
             if len(r.media) > len(most_complete.media): most_complete = r
             elif len(r.media) == len(most_complete.media) and len(r.metadata) > len(most_complete.metadata): most_complete = r
         return most_complete
+
+    def set_context(self, key: str, val: Any) -> Metadata:
+        self._context[key] = val
+        return self
+
+    def get_context(self, key: str, default: Any = None) -> Any:
+        return self._context.get(key, default)
@@ -43,7 +43,6 @@ def setup_paths(paths: list[str]) -> None:
     # sort based on the length of the path, so that the longest path is last in the list
     auto_archiver.modules.__path__ = sorted(auto_archiver.modules.__path__, key=len, reverse=True)

-
 def get_module(module_name: str, config: dict) -> BaseModule:
     """
     Gets and sets up a module using the provided config

@@ -69,6 +68,7 @@ def get_module_lazy(module_name: str, suppress_warnings: bool = False) -> LazyBaseModule:
     return module

+
 def available_modules(with_manifest: bool=False, limit_to_modules: List[str]= [], suppress_warnings: bool = False) -> List[LazyBaseModule]:

     # search through all valid 'modules' paths. Default is 'modules' in the current directory
     # see odoo/modules/module.py -> get_modules
@@ -17,9 +17,8 @@ import traceback

 from rich_argparse import RichHelpFormatter

-from .context import ArchivingContext

-from .metadata import Metadata
+from .metadata import Metadata, Media
 from ..version import __version__
 from .config import yaml, read_yaml, store_yaml, to_dot_notation, merge_dicts, EMPTY_CONFIG, DefaultValidatingParser
 from .module import available_modules, LazyBaseModule, get_module, setup_paths

@@ -268,7 +267,6 @@ class ArchivingOrchestrator:
         for url in urls:
             logger.debug(f"Processing URL: '{url}'")
             yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")

         pseudo_module = type('CLIFeeder', (Feeder,), {
             'name': 'cli_feeder',

@@ -297,9 +295,6 @@ class ArchivingOrchestrator:
                 continue
             if loaded_module:
                 step_items.append(loaded_module)
-                # TODO temp solution
-                if module_type == "storage":
-                    ArchivingContext.set("storages", step_items, keep_on_reset=True)

         check_steps_ok()
         self.config['steps'][f"{module_type}s"] = step_items

@@ -449,11 +444,12 @@ class ArchivingOrchestrator:
                 logger.error(f"ERROR enricher {e.name}: {exc}: {traceback.format_exc()}")

         # 5 - store all downloaded/generated media
-        result.store()
+        result.store(storages=self.storages)

         # 6 - format and store formatted if needed
+        final_media: Media
         if final_media := self.formatters[0].format(result):
-            final_media.store(url=url, metadata=result)
+            final_media.store(url=url, metadata=result, storages=self.storages)
             result.set_final_media(final_media)

         if result.is_empty():
@@ -8,16 +8,16 @@ from slugify import slugify

 from auto_archiver.utils.misc import random_str

-from auto_archiver.core import Media, BaseModule, ArchivingContext, Metadata
+from auto_archiver.core import Media, BaseModule, Metadata
 from auto_archiver.modules.hash_enricher.hash_enricher import HashEnricher
-
+from auto_archiver.core.module import get_module

 class Storage(BaseModule):

-    def store(self, media: Media, url: str, metadata: Optional[Metadata]=None) -> None:
-        if media.is_stored():
+    def store(self, media: Media, url: str, metadata: Metadata=None) -> None:
+        if media.is_stored(in_storage=self):
             logger.debug(f"{media.key} already stored, skipping")
             return
-        self.set_key(media, url)
+        self.set_key(media, url, metadata)
         self.upload(media, metadata=metadata)
         media.add_url(self.get_cdn_url(media))

@@ -32,30 +32,31 @@ class Storage(BaseModule):
         with open(media.filename, 'rb') as f:
             return self.uploadf(f, media, **kwargs)

-    def set_key(self, media: Media, url) -> None:
+    def set_key(self, media: Media, url, metadata: Metadata) -> None:
         """takes the media and optionally item info and generates a key"""
         if media.key is not None and len(media.key) > 0: return
-        folder = ArchivingContext.get("folder", "")
+        folder = metadata.folder
         filename, ext = os.path.splitext(media.filename)

         # Handle path_generator logic
-        path_generator = ArchivingContext.get("path_generator", "url")
+        path_generator = self.config.get("path_generator", "url")
         if path_generator == "flat":
             path = ""
             filename = slugify(filename) # Ensure filename is slugified
         elif path_generator == "url":
             path = slugify(url)
         elif path_generator == "random":
-            path = ArchivingContext.get("random_path", random_str(24), True)
+            path = self.config.get("random_path", random_str(24), True)
         else:
             raise ValueError(f"Invalid path_generator: {path_generator}")

         # Handle filename_generator logic
-        filename_generator = ArchivingContext.get("filename_generator", "random")
+        filename_generator = self.config.get("filename_generator", "random")
         if filename_generator == "random":
             filename = random_str(24)
         elif filename_generator == "static":
-            he = HashEnricher({"hash_enricher": {"algorithm": ArchivingContext.get("hash_enricher.algorithm"), "chunksize": 1.6e7}})
+            # load the hash_enricher module
+            he = get_module(HashEnricher, self.config)
             hd = he.calculate_hash(media.filename)
             filename = hd[:24]
         else:
@@ -2,7 +2,7 @@ from loguru import logger
 import csv

 from . import Feeder
-from ..core import Metadata, ArchivingContext
+from ..core import Metadata
 from ..utils import url_or_none

 class CSVFeeder(Feeder):

@@ -34,5 +34,4 @@ class CSVFeeder(Feeder):
         for row in reader:
             url = row[0]
             logger.debug(f"Processing {url}")
             yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
@@ -2,7 +2,7 @@ from loguru import logger
 import csv

 from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata
 from auto_archiver.utils import url_or_none

 class CSVFeeder(Feeder):

@@ -19,5 +19,4 @@ class CSVFeeder(Feeder):
         for row in reader:
             url = row[0]
             logger.debug(f"Processing {url}")
             yield Metadata().set_url(url)
-            ArchivingContext.set("folder", "cli")
@@ -6,7 +6,7 @@ from yt_dlp.extractor.common import InfoExtractor
 from loguru import logger

 from auto_archiver.core.extractor import Extractor
-from ...core import Metadata, Media, ArchivingContext
+from ...core import Metadata, Media

 class GenericExtractor(Extractor):
     _dropins = {}
@@ -6,7 +6,7 @@ from urllib.parse import quote
 from loguru import logger

 from auto_archiver.core import Database
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.modules.gsheet_feeder import GWorksheet


@@ -93,8 +93,7 @@ class GsheetsDb(Database):
             logger.debug(f"Unable to update sheet: {e}")

     def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
-        # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
-        if gsheet := ArchivingContext.get("gsheet"):
+        if gsheet := item.get_context("gsheet"):
             gw: GWorksheet = gsheet.get("worksheet")
             row: int = gsheet.get("row")
         elif self.sheet_id:
@@ -15,7 +15,7 @@ from loguru import logger
 from slugify import slugify

 from auto_archiver.core import Feeder
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata
 from . import GWorksheet


@@ -60,17 +60,15 @@ class GsheetsFeeder(Feeder):

                 # All checks done - archival process starts here
                 m = Metadata().set_url(url)
-                ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
                 if gw.get_cell_or_default(row, 'folder', "") is None:
                     folder = ''
                 else:
                     folder = slugify(gw.get_cell_or_default(row, 'folder', "").strip())
-                if len(folder):
-                    if self.use_sheet_names_in_stored_paths:
-                        ArchivingContext.set("folder", os.path.join(folder, slugify(self.sheet), slugify(wks.title)), True)
-                    else:
-                        ArchivingContext.set("folder", folder, True)
+                if len(folder) and self.use_sheet_names_in_stored_paths:
+                    folder = os.path.join(folder, slugify(self.sheet), slugify(wks.title))

+                m.set_context('folder', folder)
+                m.set_context('worksheet', {"row": row, "worksheet": gw})
                 yield m

             logger.success(f'Finished worksheet {wks.title}')
@@ -11,7 +11,7 @@ import hashlib
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext
+from auto_archiver.core import Metadata


 class HashEnricher(Enricher):
@@ -16,7 +16,7 @@ from loguru import logger
 from telethon.sync import TelegramClient

 from auto_archiver.core import Extractor
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.utils import random_str


@@ -61,7 +61,7 @@ class InstagramTbotExtractor(Extractor):
         if not "instagram.com" in url: return False

         result = Metadata()
-        tmp_dir = ArchivingContext.get_tmp_dir()
+        tmp_dir = self.tmp_dir
         with self.client.start():
             chat = self.client.get_entity("instagram_load_bot")
             since_id = self.client.send_message(entity=chat, message=url).id
@@ -4,7 +4,7 @@ from urllib.parse import urlparse
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, ArchivingContext, Media
+from auto_archiver.core import Metadata, Media


 class SSLEnricher(Enricher):
@@ -3,7 +3,7 @@ import requests, time
 from loguru import logger

 from auto_archiver.core import Enricher
-from auto_archiver.core import Metadata, Media, ArchivingContext
+from auto_archiver.core import Metadata, Media
 from auto_archiver.modules.s3_storage import S3Storage
 from auto_archiver.core.module import get_module


@@ -25,7 +25,7 @@ class WhisperEnricher(Enricher):
         job_results = {}
         for i, m in enumerate(to_enrich.media):
             if m.is_video() or m.is_audio():
-                m.store(url=url, metadata=to_enrich)
+                m.store(url=url, metadata=to_enrich, storages=self.storages)
                 try:
                     job_id = self.submit_job(m)
                     job_results[job_id] = False

@@ -110,7 +110,7 @@ class WhisperEnricher(Enricher):

     def _get_s3_storage(self) -> S3Storage:
         try:
-            return next(s for s in ArchivingContext.get("storages") if s.__class__ == S3Storage)
+            return next(s for s in self.storages if s.__class__ == S3Storage)
         except:
             logger.warning("No S3Storage instance found in storages")
             return
@@ -1,5 +0,0 @@
-import tempfile
-
-from auto_archiver.core.context import ArchivingContext
-
-ArchivingContext.reset(full_reset=True)