From 6f6eb2db7a96e21778dea93e607f5d9322022aca Mon Sep 17 00:00:00 2001 From: msramalho <19508417+msramalho@users.noreply.github.com> Date: Thu, 23 Mar 2023 14:28:45 +0000 Subject: [PATCH] Archiving Context refactor complete --- src/auto_archiver/core/__init__.py | 2 +- src/auto_archiver/core/context.py | 19 +++++++++- src/auto_archiver/core/media.py | 37 ++++++++++++++----- src/auto_archiver/core/metadata.py | 13 +++++-- src/auto_archiver/core/orchestrator.py | 17 ++------- src/auto_archiver/databases/gsheet_db.py | 9 ++--- src/auto_archiver/feeders/cli_feeder.py | 6 ++- src/auto_archiver/feeders/gsheet_feeder.py | 12 +++--- .../formatters/templates/html_template.html | 8 +++- .../formatters/templates/macros.html | 10 ++++- src/auto_archiver/storages/storage.py | 18 ++++----- 11 files changed, 96 insertions(+), 55 deletions(-) diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py index d9a04bd..99765c7 100644 --- a/src/auto_archiver/core/__init__.py +++ b/src/auto_archiver/core/__init__.py @@ -1,5 +1,5 @@ -from .media import Media from .metadata import Metadata +from .media import Media from .step import Step from .context import ArchivingContext diff --git a/src/auto_archiver/core/context.py b/src/auto_archiver/core/context.py index c1709e7..fe06e41 100644 --- a/src/auto_archiver/core/context.py +++ b/src/auto_archiver/core/context.py @@ -1,3 +1,5 @@ +from loguru import logger + class ArchivingContext: """ @@ -7,11 +9,15 @@ class ArchivingContext: ArchivingContext.set(key, value) and ArchivingContext.get(key, default) + + When reset is called, all values are cleared EXCEPT if they were .set(keep_on_reset=True) + reset(full_reset=True) will recreate everything including the keep_on_reset status """ _instance = None def __init__(self): self.configs = {} + self.keep_on_reset = set() @staticmethod def get_instance(): @@ -20,13 +26,22 @@ class ArchivingContext: return ArchivingContext._instance @staticmethod - def set(key, value): - 
        ArchivingContext.get_instance().configs[key] = value
+    def set(key, value, keep_on_reset: bool = False):
+        logger.debug(f"SET [{key}]={value}")
+        ac = ArchivingContext.get_instance()
+        ac.configs[key] = value
+        if keep_on_reset: ac.keep_on_reset.add(key)
 
     @staticmethod
     def get(key: str, default=None):
         return ArchivingContext.get_instance().configs.get(key, default)
 
+    @staticmethod
+    def reset(full_reset: bool = False):
+        ac = ArchivingContext.get_instance()
+        if full_reset: ac.keep_on_reset = set()
+        ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
+
     # ---- custom getters/setters for widely used context values
 
     @staticmethod
diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py
index 53f2a0b..9f15bdc 100644
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -3,19 +3,43 @@ from __future__ import annotations
 from ast import List
 from typing import Any
 from dataclasses import dataclass, field
-from dataclasses_json import dataclass_json
+from dataclasses_json import dataclass_json, config
 import mimetypes
 
+from .context import ArchivingContext
+
+from loguru import logger
 
 
-@dataclass_json # annotation order matters
+@dataclass_json  # annotation order matters
 @dataclass
 class Media:
     filename: str
     key: str = None
     urls: List[str] = field(default_factory=list)
-    _mimetype: str = None # eg: image/jpeg
     properties: dict = field(default_factory=dict)
+    _mimetype: str = None # eg: image/jpeg
+    _stored: bool = field(default=False, repr=False, metadata=config(exclude=True))
+
+    def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
+        # stores the media into the provided/available storages [Storage]
+        # repeats the process for its properties, in case they have inner media themselves
+        # for now it only goes down 1 level but it's easy to make it recursive if needed
+        storages = override_storages or ArchivingContext.get("storages")
+        if not storages:
+
logger.warning(f"No storages found in local context or provided directly for {self.filename}.") + return + + for s in storages: + s.store(self, url) + # Media can be inside media properties, examples include transformations on original media + for prop in self.properties.values(): + if isinstance(prop, Media): + s.store(prop, url) + if isinstance(prop, list): + for prop_media in prop: + if isinstance(prop_media, Media): + s.store(prop_media, url) def set(self, key: str, value: Any) -> Media: self.properties[key] = value @@ -44,10 +68,3 @@ class Media: def is_audio(self) -> bool: return self.mimetype.startswith("audio") - - def store(self): - """ - either stores this media entry and all its media descendants - or returns if that process is already completed - """ - pass diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py index 2ae583d..09791bf 100644 --- a/src/auto_archiver/core/metadata.py +++ b/src/auto_archiver/core/metadata.py @@ -8,9 +8,10 @@ import datetime from urllib.parse import urlparse from dateutil.parser import parse as parse_dt from .media import Media +from .context import ArchivingContext -# annotation order matters -@dataclass_json + +@dataclass_json # annotation order matters @dataclass class Metadata: status: str = "no archiver" @@ -23,7 +24,6 @@ class Metadata: tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata=config(exclude=True)) # tmp_metadata: Dict[str, Any] = field(default_factory=dict, repr=False, metadata=config(exclude=True)) # contains internal properties not to be leaked when .to_json/repr/str is called - def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata: """ merges two Metadata instances, will overwrite according to overwrite_left flag @@ -46,6 +46,12 @@ class Metadata: return right.merge(self) return self + def store(self: Metadata, override_storages: List = None): + # calls .store for all contained media. 
storages [Storage] + storages = override_storages or ArchivingContext.get("storages") + for media in self.media: + media.store(override_storages=storages) + def set(self, key: str, val: Any, is_tmp=False) -> Metadata: # if not self.metadata: self.metadata = {} self.metadata[key] = val @@ -144,4 +150,3 @@ class Metadata: def __str__(self) -> str: return self.__repr__() - \ No newline at end of file diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py index 03339fc..1835ae2 100644 --- a/src/auto_archiver/core/orchestrator.py +++ b/src/auto_archiver/core/orchestrator.py @@ -25,7 +25,7 @@ class ArchivingOrchestrator: self.archivers: List[Archiver] = config.archivers self.databases: List[Database] = config.databases self.storages: List[Storage] = config.storages - ArchivingContext.set("storages", self.storages) + ArchivingContext.set("storages", self.storages, keep_on_reset=True) for a in self.archivers: a.setup() @@ -35,6 +35,7 @@ class ArchivingOrchestrator: def feed_item(self, item: Metadata) -> Metadata: try: + ArchivingContext.reset() with tempfile.TemporaryDirectory(dir="./") as tmp_dir: ArchivingContext.set_tmp_dir(tmp_dir) return self.archive(item) @@ -108,22 +109,12 @@ class ArchivingOrchestrator: # 5 - store media # looks for Media in result.media and also result.media[x].properties (as list or dict values) - for s in self.storages: - for m in result.media: - s.store(m, result) # modifies media - # Media can be inside media properties, examples include transformations on original media - for prop in m.properties.values(): - if isinstance(prop, Media): - s.store(prop, result) - if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media): - for prop_media in prop: - s.store(prop_media, result) + result.store() # 6 - format and store formatted if needed # enrichers typically need access to already stored URLs etc if (final_media := self.formatter.format(result)): - for s in self.storages: - 
s.store(final_media, result) + final_media.store() result.set_final_media(final_media) if result.is_empty(): diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/databases/gsheet_db.py index b28d8ed..077392b 100644 --- a/src/auto_archiver/databases/gsheet_db.py +++ b/src/auto_archiver/databases/gsheet_db.py @@ -5,8 +5,7 @@ from urllib.parse import quote from loguru import logger from . import Database -from ..core import Metadata -from ..core import Media +from ..core import Metadata, Media, ArchivingContext from ..utils import GWorksheet @@ -86,7 +85,7 @@ class GsheetsDb(Database): logger.debug(f"Unable to update sheet: {e}") def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]: - # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now - gw: GWorksheet = item.get("gsheet").get("worksheet") - row: int = item.get("gsheet").get("row") + # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now + gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet") + row: int = ArchivingContext.get("gsheet").get("row") return gw, row diff --git a/src/auto_archiver/feeders/cli_feeder.py b/src/auto_archiver/feeders/cli_feeder.py index 8de3601..b2f0add 100644 --- a/src/auto_archiver/feeders/cli_feeder.py +++ b/src/auto_archiver/feeders/cli_feeder.py @@ -1,7 +1,7 @@ from loguru import logger from . 
import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
 
 
 class CLIFeeder(Feeder):
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
     def __iter__(self) -> Metadata:
         for url in self.urls:
             logger.debug(f"Processing {url}")
-            yield Metadata().set_url(url).set("folder", "cli", True)
+            ArchivingContext.set("folder", "cli", keep_on_reset=True)
+            yield Metadata().set_url(url)
+
         logger.success(f"Processed {len(self.urls)} URL(s)")
diff --git a/src/auto_archiver/feeders/gsheet_feeder.py b/src/auto_archiver/feeders/gsheet_feeder.py
index 42fdb54..152ac54 100644
--- a/src/auto_archiver/feeders/gsheet_feeder.py
+++ b/src/auto_archiver/feeders/gsheet_feeder.py
@@ -5,9 +5,10 @@ from slugify import slugify
 
 # from . import Enricher
 from . import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
 from ..utils import Gsheets, GWorksheet
 
+
 class GsheetsFeeder(Gsheets, Feeder):
     name = "gsheet_feeder"
 
@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
             "help": "(CSV) explicitly block some worksheets from being processed",
             "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
         },
-        "use_sheet_names_in_stored_paths":{
+        "use_sheet_names_in_stored_paths": {
             "default": True,
             "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
         }
@@ -61,11 +62,12 @@ class GsheetsFeeder(Gsheets, Feeder):
                 if status not in ['', None]: continue
 
                 # All checks done - archival process starts here
-                m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+                m = Metadata().set_url(url)
+                ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
                 if self.use_sheet_names_in_stored_paths:
-                    m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                    ArchivingContext.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
                 yield m
-
+
             logger.success(f'Finished worksheet {wks.title}')
 
     def should_process_sheet(self, sheet_name: str) -> bool:
diff --git
a/src/auto_archiver/formatters/templates/html_template.html b/src/auto_archiver/formatters/templates/html_template.html index 68e54d5..901c0c6 100644 --- a/src/auto_archiver/formatters/templates/html_template.html +++ b/src/auto_archiver/formatters/templates/html_template.html @@ -29,7 +29,7 @@ margin: auto; border: 1px solid; border-collapse: collapse; - vertical-align:top; + vertical-align: top; } table.metadata td:first-child { @@ -185,7 +185,11 @@ el.addEventListener("copy", (e) => { e.preventDefault(); if (e.clipboardData) { - e.clipboardData.setData("text/plain", el.textContent); + if (el.hasAttribute("copy-value")) { + e.clipboardData.setData("text/plain", el.getAttribute("copy-value")); + } else { + e.clipboardData.setData("text/plain", el.textContent); + } console.log(e.clipboardData.getData("text")) showNotification("copied!") } diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/formatters/templates/macros.html index 658fd40..e72f4f3 100644 --- a/src/auto_archiver/formatters/templates/macros.html +++ b/src/auto_archiver/formatters/templates/macros.html @@ -46,14 +46,16 @@ No preview available for {{ m.key }}. {% endif %} {% if links %} open or -download +download or +{{ copy_urlize(url, "copy") }} +
 {% endif %}
 {% endfor %}
 {%- endmacro -%}
 
-{% macro copy_urlize(val) -%}
+{% macro copy_urlize(val, href_text) -%}
 {% if val is mapping %}
 {% else %}
+{% if href_text | length == 0 %}
 {{ val | string | urlize }}
+{% else %}
+{{ href_text | string | urlize }}
+{% endif %}
 {% endif %}
 {%- endmacro -%}
\ No newline at end of file
diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py
index c8bb1c7..add7242 100644
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
-import hashlib
-from typing import IO, Any
+from typing import IO
 
-from ..core import Media, Metadata, Step
+from ..core import Media, Step, ArchivingContext
 from ..enrichers import HashEnricher
 from loguru import logger
 import os, uuid
@@ -42,10 +41,10 @@ class Storage(Step):
         # only for typing...
         return Step.init(name, config, Storage)
 
-    def store(self, media: Media, item: Metadata) -> None:
-        self.set_key(media, item)
+    def store(self, media: Media, url: str) -> None:
+        self.set_key(media, url)
         self.upload(media)
         media.add_url(self.get_cdn_url(media))
 
     @abstractmethod
     def get_cdn_url(self, media: Media) -> str: pass
@@ -58,19 +58,20 @@ class Storage(Step):
         with open(media.filename, 'rb') as f:
             return self.uploadf(f, media, **kwargs)
 
-    def set_key(self, media: Media, item: Metadata) -> None:
+    def set_key(self, media: Media, url) -> None:
         """takes the media and optionally item info and generates a key"""
         if media.key is not None and len(media.key) > 0: return
-        folder = item.get("folder", "")
+        folder = ArchivingContext.get("folder", "")
         filename, ext = os.path.splitext(media.filename)
 
         # path_generator logic
         if self.path_generator == "flat":
             path = ""
             filename = slugify(filename) # in case it comes with os.sep
-        elif self.path_generator == "url": path = slugify(item.get_url())
+        elif self.path_generator == "url": path = slugify(url)
         elif self.path_generator == "random":
-            path = item.get("random_path", str(uuid.uuid4())[:16], True)
+            path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16])
+            ArchivingContext.set("random_path", path)
 
         # filename_generator logic
         if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]