Archiving Context refactor complete

pull/74/head
msramalho 2023-03-23 14:28:45 +00:00
parent 906ed0f6e0
commit 6f6eb2db7a
11 changed files with 96 additions and 55 deletions

View file

@@ -1,5 +1,5 @@
from .metadata import Metadata
from .media import Media
from .step import Step
from .context import ArchivingContext

View file

@@ -1,3 +1,5 @@
from loguru import logger
class ArchivingContext:
"""
@@ -7,11 +9,15 @@ class ArchivingContext:
ArchivingContext.set(key, value)
and
ArchivingContext.get(key, default)
When reset() is called, all values are cleared EXCEPT those that were .set(keep_on_reset=True);
reset(full_reset=True) wipes everything, including the keep_on_reset markers
"""
_instance = None
def __init__(self):
self.configs = {}
self.keep_on_reset = set()
@staticmethod
def get_instance():
@@ -20,13 +26,22 @@ class ArchivingContext:
return ArchivingContext._instance
@staticmethod
def set(key, value, keep_on_reset: bool = False):
logger.debug(f"SET [{key}]={value}")
ac = ArchivingContext.get_instance()
ac.configs[key] = value
if keep_on_reset: ac.keep_on_reset.add(key)
@staticmethod
def get(key: str, default=None, create_if_missing=False):
vals = ArchivingContext.get_instance().configs
if create_if_missing and key not in vals: vals[key] = default
return vals.get(key, default)
@staticmethod
def reset(full_reset: bool = False):
ac = ArchivingContext.get_instance()
if full_reset: ac.keep_on_reset = set()
ac.configs = {k: v for k, v in ac.configs.items() if k in ac.keep_on_reset}
# ---- custom getters/setters for widely used context values
@staticmethod

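A minimal usage sketch of the new context semantics (illustrative, not part of the commit; key names and values are examples):

from auto_archiver.core import ArchivingContext  # package path assumed from this repo's layout

ArchivingContext.set("storages", ["s3", "local"], keep_on_reset=True)
ArchivingContext.set("folder", "cli")  # per-item value, dropped on reset

ArchivingContext.reset()  # partial reset: keep_on_reset keys survive
assert ArchivingContext.get("storages") == ["s3", "local"]
assert ArchivingContext.get("folder") is None

ArchivingContext.reset(full_reset=True)  # also clears the keep_on_reset markers
assert ArchivingContext.get("storages", "gone") == "gone"
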
View file

@@ -3,19 +3,43 @@ from __future__ import annotations
from typing import Any, List
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json, config
import mimetypes
from .context import ArchivingContext
from loguru import logger
@dataclass_json # annotation order matters
@dataclass
class Media:
filename: str
key: str = None
urls: List[str] = field(default_factory=list)
properties: dict = field(default_factory=dict)
_mimetype: str = None # eg: image/jpeg
_stored: bool = field(default=False, repr=False, metadata=config(exclude=True))
def store(self: Media, override_storages: List = None, url: str = "url-not-available"):
# stores the media into the provided/available storages [Storage]
# repeats the process for its properties, in case they have inner media themselves
# for now it only goes down 1 level but it's easy to make it recursive if needed
storages = override_storages or ArchivingContext.get("storages")
if not storages:
logger.warning(f"No storages found in local context or provided directly for {self.filename}.")
return
for s in storages:
s.store(self, url)
# Media can be inside media properties, examples include transformations on original media
for prop in self.properties.values():
if isinstance(prop, Media):
s.store(prop, url)
if isinstance(prop, list):
for prop_media in prop:
if isinstance(prop_media, Media):
s.store(prop_media, url)
def set(self, key: str, value: Any) -> Media:
self.properties[key] = value
@@ -44,10 +68,3 @@ class Media:
def is_audio(self) -> bool:
return self.mimetype.startswith("audio")

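For orientation, a hedged sketch of the new Media.store flow (filenames are hypothetical; the storages are assumed to have been registered in the context, as the orchestrator does):

video = Media(filename="video.mp4")
video.set("thumbnail", Media(filename="thumb.jpg"))  # nested media inside properties

# uploads video.mp4 to every storage from ArchivingContext.get("storages"),
# then walks properties one level deep and uploads thumb.jpg as well
video.store(url="https://example.com/post/1")
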
View file

@@ -8,9 +8,10 @@ import datetime
from urllib.parse import urlparse
from dateutil.parser import parse as parse_dt
from .media import Media
from .context import ArchivingContext
@dataclass_json # annotation order matters
@dataclass
class Metadata:
status: str = "no archiver"
@@ -23,7 +24,6 @@ class Metadata:
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata=config(exclude=True))
# tmp_metadata: Dict[str, Any] = field(default_factory=dict, repr=False, metadata=config(exclude=True)) # contains internal properties not to be leaked when .to_json/repr/str is called
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
merges two Metadata instances, overwriting according to the overwrite_left flag
@@ -46,6 +46,12 @@ class Metadata:
return right.merge(self)
return self
def store(self: Metadata, override_storages: List = None):
# calls .store() on all contained media, using the provided storages [Storage] or those from the ArchivingContext
storages = override_storages or ArchivingContext.get("storages")
for media in self.media:
media.store(override_storages=storages)
def set(self, key: str, val: Any, is_tmp=False) -> Metadata:
# if not self.metadata: self.metadata = {}
self.metadata[key] = val
@@ -144,4 +150,3 @@ class Metadata:
def __str__(self) -> str:
return self.__repr__()

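And the corresponding Metadata.store fan-out, sketched (add_media is assumed from Metadata's existing API, which this hunk does not show):

item = Metadata().set_url("https://example.com/post/1")
item.add_media(Media(filename="page.html"))  # add_media assumed, not shown in this diff
item.store()  # calls media.store(...) for every attached Media
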
View file

@@ -25,7 +25,7 @@ class ArchivingOrchestrator:
self.archivers: List[Archiver] = config.archivers
self.databases: List[Database] = config.databases
self.storages: List[Storage] = config.storages
ArchivingContext.set("storages", self.storages)
ArchivingContext.set("storages", self.storages, keep_on_reset=True)
for a in self.archivers: a.setup()
@@ -35,6 +35,7 @@ class ArchivingOrchestrator:
def feed_item(self, item: Metadata) -> Metadata:
try:
ArchivingContext.reset()
with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
ArchivingContext.set_tmp_dir(tmp_dir)
return self.archive(item)
@@ -108,22 +109,12 @@
# 5 - store media
# looks for Media in result.media and also result.media[x].properties (as list or dict values)
result.store()
# 6 - format and store formatted if needed
# enrichers typically need access to already stored URLs etc
if (final_media := self.formatter.format(result)):
final_media.store()
result.set_final_media(final_media)
if result.is_empty():

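Taken together, the per-item lifecycle this refactor enables looks roughly like this (a sketch with hypothetical names, not the orchestrator's literal code):

import tempfile

# one-time setup: survives every partial reset
ArchivingContext.set("storages", storages, keep_on_reset=True)

for item in feeder:  # hypothetical feeder loop
    ArchivingContext.reset()  # drops per-item keys like "folder" and "gsheet"
    with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
        ArchivingContext.set_tmp_dir(tmp_dir)
        result = archive(item)  # archivers/enrichers populate result.media
        result.store()  # uploads all media via the context storages
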
View file

@@ -5,8 +5,7 @@ from urllib.parse import quote
from loguru import logger
from . import Database
from ..core import Metadata, Media, ArchivingContext
from ..utils import GWorksheet
@@ -86,7 +85,7 @@ class GsheetsDb(Database):
logger.debug(f"Unable to update sheet: {e}")
def _retrieve_gsheet(self, item: Metadata) -> Tuple[GWorksheet, int]:
# TODO: to make gsheet_db less coupled with gsheet_feeder's "gsheet" parameter, this method could 1st try to fetch "gsheet" from ArchivingContext and, if missing, manage its own singleton - not needed for now
gw: GWorksheet = ArchivingContext.get("gsheet").get("worksheet")
row: int = ArchivingContext.get("gsheet").get("row")
return gw, row

View file

@@ -1,7 +1,7 @@
from loguru import logger
from . import Feeder
from ..core import Metadata, ArchivingContext
class CLIFeeder(Feeder):
@@ -26,5 +26,7 @@ class CLIFeeder(Feeder):
def __iter__(self) -> Metadata:
for url in self.urls:
logger.debug(f"Processing {url}")
yield Metadata().set_url(url)
ArchivingContext.set("folder", "cli")
logger.success(f"Processed {len(self.urls)} URL(s)")

View file

@@ -5,9 +5,10 @@ from slugify import slugify
# from . import Enricher
from . import Feeder
from ..core import Metadata, ArchivingContext
from ..utils import Gsheets, GWorksheet
class GsheetsFeeder(Gsheets, Feeder):
name = "gsheet_feeder"
@@ -31,7 +32,7 @@ class GsheetsFeeder(Gsheets, Feeder):
"help": "(CSV) explicitly block some worksheets from being processed",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
},
"use_sheet_names_in_stored_paths":{
"use_sheet_names_in_stored_paths": {
"default": True,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
}
@@ -61,11 +62,12 @@ class GsheetsFeeder(Gsheets, Feeder):
if status not in ['', None]: continue
# All checks done - archival process starts here
m = Metadata().set_url(url)
ArchivingContext.set("gsheet", {"row": row, "worksheet": gw}, keep_on_reset=True)
if self.use_sheet_names_in_stored_paths:
m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
ArchivingContext.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
yield m
logger.success(f'Finished worksheet {wks.title}')
def should_process_sheet(self, sheet_name: str) -> bool:

View file

@@ -29,7 +29,7 @@
margin: auto;
border: 1px solid;
border-collapse: collapse;
vertical-align: top;
}
table.metadata td:first-child {
@@ -185,7 +185,11 @@
el.addEventListener("copy", (e) => {
e.preventDefault();
if (e.clipboardData) {
if (el.hasAttribute("copy-value")) {
e.clipboardData.setData("text/plain", el.getAttribute("copy-value"));
} else {
e.clipboardData.setData("text/plain", el.textContent);
}
console.log(e.clipboardData.getData("text"))
showNotification("copied!")
}

View file

@@ -46,14 +46,16 @@ No preview available for {{ m.key }}.
{% endif %}
{% if links %}
<a href="{{ url }}">open</a> or
<a href="{{ url }}" download="">download</a>
<a href="{{ url }}" download="">download</a> or
{{ copy_urlize(url, "copy") }}
<br>
{% endif %}
{% endfor %}
{%- endmacro -%}
{% macro copy_urlize(val, href_text) -%}
{% if val is mapping %}
<ul>
@@ -65,7 +67,11 @@ No preview available for {{ m.key }}.
</ul>
{% else %}
{% if href_text | length == 0 %}
<span class="copy">{{ val | string | urlize }}</span>
{% else %}
<span class="copy" copy-value="{{val}}">{{ href_text | string | urlize }}</span>
{% endif %}
{% endif %}
{%- endmacro -%}

View file

@@ -1,10 +1,9 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
import hashlib
from typing import IO
from ..core import Media, Step, ArchivingContext
from ..enrichers import HashEnricher
from loguru import logger
import os, uuid
@@ -42,10 +41,11 @@ class Storage(Step):
# only for typing...
return Step.init(name, config, Storage)
def store(self, media: Media, url: str) -> None:
self.set_key(media, url)
self.upload(media)
media.add_url(self.get_cdn_url(media))
media.
@abstractmethod
def get_cdn_url(self, media: Media) -> str: pass
@@ -58,19 +58,19 @@ class Storage(Step):
with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, url) -> None:
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = ArchivingContext.get("folder", "")
filename, ext = os.path.splitext(media.filename)
# path_generator logic
if self.path_generator == "flat":
path = ""
filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(item.get_url())
elif self.path_generator == "url": path = slugify(url)
elif self.path_generator == "random":
path = ArchivingContext.get("random_path", str(uuid.uuid4())[:16], True)
# filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
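A hedged sketch of the keys this logic produces; the final join of folder, path, and filename happens below the visible hunk, so the exact composition is an assumption:

# assumed composition: media.key = os.path.join(folder, path, filename + ext)
ArchivingContext.set("folder", "my-workbook/sheet1", keep_on_reset=True)
media = Media(filename="tmp_x/frame.jpg")

# path_generator="flat"   -> "my-workbook/sheet1/tmp-x-frame.jpg" (filename slugified)
# path_generator="url"    -> "my-workbook/sheet1/<slugified-url>/frame.jpg"
# path_generator="random" -> "my-workbook/sheet1/<16-char-random>/frame.jpg"
# filename_generator="random" additionally replaces the original name with a uuid prefix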