From 6f6eb2db7a96e21778dea93e607f5d9322022aca Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 23 Mar 2023 14:28:45 +0000
Subject: [PATCH] Archiving Context refactor complete
---
src/auto_archiver/core/__init__.py | 2 +-
src/auto_archiver/core/context.py | 19 +++++++++-
src/auto_archiver/core/media.py | 37 ++++++++++++++-----
src/auto_archiver/core/metadata.py | 13 +++++--
src/auto_archiver/core/orchestrator.py | 17 ++-------
src/auto_archiver/databases/gsheet_db.py | 9 ++---
src/auto_archiver/feeders/cli_feeder.py | 6 ++-
src/auto_archiver/feeders/gsheet_feeder.py | 12 +++---
.../formatters/templates/html_template.html | 8 +++-
.../formatters/templates/macros.html | 10 ++++-
src/auto_archiver/storages/storage.py | 18 ++++-----
11 files changed, 96 insertions(+), 55 deletions(-)
diff --git a/src/auto_archiver/core/__init__.py b/src/auto_archiver/core/__init__.py
index d9a04bd..99765c7 100644
--- a/src/auto_archiver/core/__init__.py
+++ b/src/auto_archiver/core/__init__.py
@@ -1,5 +1,5 @@
-from .media import Media
from .metadata import Metadata
+from .media import Media
from .step import Step
from .context import ArchivingContext
diff --git a/src/auto_archiver/core/context.py b/src/auto_archiver/core/context.py
index c1709e7..fe06e41 100644
--- a/src/auto_archiver/core/context.py
+++ b/src/auto_archiver/core/context.py
@@ -1,3 +1,5 @@
+from loguru import logger
+
class ArchivingContext:
"""
@@ -7,11 +9,15 @@ class ArchivingContext:
ArchivingContext.set(key, value)
and
ArchivingContext.get(key, default)
+
+    When reset() is called, all values are cleared EXCEPT those that were .set(key, value, keep_on_reset=True)
+    reset(full_reset=True) clears everything, including the keys marked with keep_on_reset
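+
+    Example (illustrative; assumes `storages` is the list of Storage instances configured by the orchestrator):
+        ArchivingContext.set("storages", storages, keep_on_reset=True)
+        ArchivingContext.get("storages")           # -> storages
+        ArchivingContext.reset()                   # keeps "storages", clears everything else
+        ArchivingContext.reset(full_reset=True)    # clears everything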
"""
_instance = None
def __init__(self):
self.configs = {}
+ self.keep_on_reset = set()
@staticmethod
def get_instance():
@@ -20,13 +26,22 @@ class ArchivingContext:
return ArchivingContext._instance
@staticmethod
- def set(key, value):
- ArchivingContext.get_instance().configs[key] = value
+ def set(key, value, keep_on_reset: bool = False):
+        logger.debug(f"SET [{key}]={value}")
+ ac = ArchivingContext.get_instance()
+ ac.configs[key] = value
+ if keep_on_reset: ac.keep_on_reset.add(key)
@staticmethod
def get(key: str, default=None):
return ArchivingContext.get_instance().configs.get(key, default)
+ @staticmethod
+ def reset(full_reset: bool = False):
+ ac = ArchivingContext.get_instance()
+ if full_reset: ac.keep_on_reset = set()
+ ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
+
# ---- custom getters/setters for widely used context values
@staticmethod
diff --git a/src/auto_archiver/core/media.py b/src/auto_archiver/core/media.py
index 53f2a0b..9f15bdc 100644
--- a/src/auto_archiver/core/media.py
+++ b/src/auto_archiver/core/media.py
@@ -3,19 +3,43 @@ from __future__ import annotations
from ast import List
from typing import Any
from dataclasses import dataclass, field
-from dataclasses_json import dataclass_json
+from dataclasses_json import dataclass_json, config
import mimetypes
+from .context import ArchivingContext
+
+from loguru import logger
-@dataclass_json # annotation order matters
+@dataclass_json # annotation order matters
@dataclass
class Media:
filename: str
key: str = None
urls: List[str] = field(default_factory=list)
- _mimetype: str = None # eg: image/jpeg
properties: dict = field(default_factory=dict)
+ _mimetype: str = None # eg: image/jpeg
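+    # assumed to flag whether this media has already been sent to storage; excluded from repr/JSON output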
+ _stored: bool = field(default=False, repr=False, metadata=config(exclude=True))
+
+ def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
+ # stores the media into the provided/available storages [Storage]
+ # repeats the process for its properties, in case they have inner media themselves
+ # for now it only goes down 1 level but it's easy to make it recursive if needed
+ storages = override_storages or ArchivingContext.get("storages")
+        if not storages:
+ logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
+ return
+
+ for s in storages:
+ s.store(self, url)
+ # Media can be inside media properties, examples include transformations on original media
+ for prop in self.properties.values():
+ if isinstance(prop, Media):
+ s.store(prop, url)
+ if isinstance(prop, list):
+ for prop_media in prop:
+ if isinstance(prop_media, Media):
+ s.store(prop_media, url)
def set(self, key: str, value: Any) -> Media:
self.properties[key] = value
@@ -44,10 +68,3 @@ class Media:
def is_audio(self) -> bool:
return self.mimetype.startswith("audio")
-
- def store(self):
- """
- either stores this media entry and all its media descendants
- or returns if that process is already completed
- """
- pass
diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index 2ae583d..09791bf 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -8,9 +8,10 @@ import datetime
from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt
from .media import Media
+from .context import ArchivingContext
-# annotation order matters
-@dataclass_json
+
+@dataclass_json # annotation order matters
@dataclass
class Metadata:
status: str = "no archiver"
@@ -23,7 +24,6 @@ class Metadata:
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata=config(exclude=True))
# tmp_metadata: Dict[str, Any] = field(default_factory=dict, repr=False, metadata=config(exclude=True)) # contains internal properties not to be leaked when .to_json/repr/str is called
-
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
merges two Metadata instances, will overwrite according to overwrite_left flag
@@ -46,6 +46,12 @@ class Metadata:
return right.merge(self)
return self
+ def store(self: Metadata, override_storages: List = None):
+        # calls .store() on each contained Media, using the provided override storages [Storage] or those available in the ArchivingContext
+ storages = override_storages or ArchivingContext.get("storages")
+ for media in self.media:
+ media.store(override_storages=storages)
+
def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
# if not self.metadata: self.metadata = {}
self.metadata[key] = val
@@ -144,4 +150,3 @@ class Metadata:
def __str__(self) -> str:
return self.__repr__()
-
\ No newline at end of file
diff --git a/src/auto_archiver/core/orchestrator.py b/src/auto_archiver/core/orchestrator.py
index 03339fc..1835ae2 100644
--- a/src/auto_archiver/core/orchestrator.py
+++ b/src/auto_archiver/core/orchestrator.py
@@ -25,7 +25,7 @@ class ArchivingOrchestrator:
self.archivers: List[Archiver] = config.archivers
self.databases: List[Database] = config.databases
self.storages: List[Storage] = config.storages
- ArchivingContext.set("storages", self.storages)
+ ArchivingContext.set("storages", self.storages, keep_on_reset=True)
for a in self.archivers: a.setup()
@@ -35,6 +35,7 @@ class ArchivingOrchestrator:
def feed_item(self, item: Metadata) -> Metadata:
try:
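+            # clear per-item context values; keys set with keep_on_reset=True (e.g. "storages") survive this call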
+ ArchivingContext.reset()
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
ArchivingContext.set_tmp_dir(tmp_dir)
return self.archive(item)
@@ -108,22 +109,12 @@ class ArchivingOrchestrator:
# 5 - store media
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
- for s in self.storages:
- for m in result.media:
- s.store(m, result) # modifies media
- # Media can be inside media properties, examples include transformations on original media
- for prop in m.properties.values():
- if isinstance(prop, Media):
- s.store(prop, result)
- if isinstance(prop, list) and len(prop) > 0 and isinstance(prop[0], Media):
- for prop_media in prop:
- s.store(prop_media, result)
+ result.store()
# 6 - format and store formatted if needed
# enrichers typically need access to already stored URLs etc
if (final_media := self.formatter.format(result)):
- for s in self.storages:
- s.store(final_media, result)
+ final_media.store()
result.set_final_media(final_media)
if result.is_empty():
diff --git a/src/auto_archiver/databases/gsheet_db.py b/src/auto_archiver/databases/gsheet_db.py
index b28d8ed..077392b 100644
--- a/src/auto_archiver/databases/gsheet_db.py
+++ b/src/auto_archiver/databases/gsheet_db.py
@@ -5,8 +5,7 @@ from urllib.parse import quote
from loguru import logger
from . import Database
-from ..core import Metadata
-from ..core import Media
+from ..core import Metadata, Media, ArchivingContext
from ..utils import GWorksheet
@@ -86,7 +85,7 @@ class GsheetsDb(Database):
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
- # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from item and, if missing, manage its own singleton - not needed for now
- gw: GWorksheet = item.get("gsheet").get("worksheet")
- row: int = item.get("gsheet").get("row")
+ # TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
+ gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
+ row: int = ArchivingContext.get("gsheet").get("row")
return gw, row
diff --git a/src/auto_archiver/feeders/cli_feeder.py b/src/auto_archiver/feeders/cli_feeder.py
index 8de3601..b2f0add 100644
--- a/src/auto_archiver/feeders/cli_feeder.py
+++ b/src/auto_archiver/feeders/cli_feeder.py
@@ -1,7 +1,7 @@
from loguru import logger
from . import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
class CLIFeeder(Feeder):
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
def __iter__(self) -> Metadata:
for url in self.urls:
logger.debug(f"Processing {url}")
-            yield Metadata().set_url(url).set("folder", "cli", True)
+            ArchivingContext.set("folder", "cli", keep_on_reset=True)
+            yield Metadata().set_url(url)
+
logger.success(f"Processed {len(self.urls)} URL(s)")
diff --git a/src/auto_archiver/feeders/gsheet_feeder.py b/src/auto_archiver/feeders/gsheet_feeder.py
index 42fdb54..152ac54 100644
--- a/src/auto_archiver/feeders/gsheet_feeder.py
+++ b/src/auto_archiver/feeders/gsheet_feeder.py
@@ -5,9 +5,10 @@ from slugify import slugify
# from . import Enricher
from . import Feeder
-from ..core import Metadata
+from ..core import Metadata, ArchivingContext
from ..utils import Gsheets, GWorksheet
+
class GsheetsFeeder(Gsheets, Feeder):
name = "gsheet_feeder"
@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
"help": "(CSV) explicitly block some worksheets from being processed",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
},
- "use_sheet_names_in_stored_paths":{
+ "use_sheet_names_in_stored_paths": {
"default": True,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
}
@@ -61,11 +62,12 @@ class GsheetsFeeder(Gsheets, Feeder):
if status not in ['', None]: continue
# All checks done - archival process starts here
- m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+ m = Metadata().set_url(url)
+ ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
if self.use_sheet_names_in_stored_paths:
- m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                ArchivingContext.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), keep_on_reset=True)
yield m
-
+
logger.success(f'Finished worksheet {wks.title}')
def should_process_sheet(self, sheet_name: str) -> bool:
diff --git a/src/auto_archiver/formatters/templates/html_template.html b/src/auto_archiver/formatters/templates/html_template.html
index 68e54d5..901c0c6 100644
--- a/src/auto_archiver/formatters/templates/html_template.html
+++ b/src/auto_archiver/formatters/templates/html_template.html
@@ -29,7 +29,7 @@
margin: auto;
border: 1px solid;
border-collapse: collapse;
- vertical-align:top;
+ vertical-align: top;
}
table.metadata td:first-child {
@@ -185,7 +185,11 @@
el.addEventListener("copy", (e) => {
e.preventDefault();
if (e.clipboardData) {
- e.clipboardData.setData("text/plain", el.textContent);
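+                        // prefer an explicit copy-value attribute when present (presumably emitted by the copy_urlize macro); otherwise fall back to the visible text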
+ if (el.hasAttribute("copy-value")) {
+ e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
+ } else {
+ e.clipboardData.setData("text/plain", el.textContent);
+ }
console.log(e.clipboardData.getData("text"))
showNotification("copied!")
}
diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/formatters/templates/macros.html
index 658fd40..e72f4f3 100644
--- a/src/auto_archiver/formatters/templates/macros.html
+++ b/src/auto_archiver/formatters/templates/macros.html
@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
{% endif %}
{% if links %}
open or
-download
+download or
+{{ copy_urlize(url, "copy") }}
+
{% endif %}
{% endfor %}
{%- endmacro -%}
-{% macro copy_urlize(val) -%}
+{% macro copy_urlize(val, href_text) -%}
{% if val is mapping %}