diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index 4707a2f..dfbd2ec 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -11,18 +11,19 @@
 from dateutil.parser import parse as parse_dt
 from .media import Media
 
 # annotation order matters
+
+
 @dataclass_json
 @dataclass
 class Metadata:
     status: str = "no archiver"
-    _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+    _processed_at: datetime = field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
     metadata: Dict[str, Any] = field(default_factory=dict)
-    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True})  # keys that are not to be saved in DBs
+    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True})  # keys that are not to be saved in DBs
     media: List[Media] = field(default_factory=list)
     final_media: Media = None  # can be overwritten by formatters
     rearchivable: bool = False
-
     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """
         merges two Metadata instances, will overwrite according to overwrite_left flag
diff --git a/src/auto_archiver/feeders/gsheet_feeder.py b/src/auto_archiver/feeders/gsheet_feeder.py
index 19b1fbc..c73015f 100644
--- a/src/auto_archiver/feeders/gsheet_feeder.py
+++ b/src/auto_archiver/feeders/gsheet_feeder.py
@@ -33,6 +33,10 @@ class GsheetsFeeder(Gsheets, Feeder):
             "default": set(),
             "help": "(CSV) explicitly block some worksheets from being processed",
             "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+        },
+        "use_sheet_names_in_stored_paths": {
+            "default": True,
+            "help": "if True, stored file paths will include 'workbook_name/worksheet_name/...'",
         }
     })
@@ -60,7 +64,10 @@ class GsheetsFeeder(Gsheets, Feeder):
             if status not in ['', None]: continue
             # All checks done - archival process starts here
-            yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True).set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+            m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+            if self.use_sheet_names_in_stored_paths:
+                m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+            yield m
         logger.success(f'Finished worksheet {wks.title}')
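This feeder change pairs with the storage changes further down: when `use_sheet_names_in_stored_paths` is on, each item carries a `folder` value that `Storage.set_key` later prefixes onto generated keys. A minimal sketch of the value being set, assuming only the `Metadata.set`/`get` behavior visible in this diff; the workbook and worksheet names are made up:

```python
import os

from slugify import slugify

from auto_archiver.core import Metadata

# Hypothetical workbook/worksheet names standing in for real spreadsheet values.
sheet, worksheet_title = "Conflict Monitoring 2023", "Sheet1"

m = Metadata().set_url("https://example.com/post/1")
use_sheet_names_in_stored_paths = True  # the new option (default: True)
if use_sheet_names_in_stored_paths:
    # slugify keeps the folder path predictable and filesystem-safe
    m.set("folder", os.path.join(slugify(sheet), slugify(worksheet_title)), True)

print(m.get("folder", ""))  # conflict-monitoring-2023/sheet1
```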
diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/formatters/templates/macros.html
index 6faf891..658fd40 100644
--- a/src/auto_archiver/formatters/templates/macros.html
+++ b/src/auto_archiver/formatters/templates/macros.html
@@ -5,18 +5,37 @@
         No URL available for {{ m.key }}.
         {% elif 'http' in url %}
         {% if 'image' in m.mimetype %}
-        [image preview markup]
+        [image preview markup, followed by a new block of reverse image search links:]
+        Reverse Image Search:&nbsp;
+        Google,&nbsp;
+        Google Lens,&nbsp;
+        Yandex,&nbsp;
+        Bing,&nbsp;
+        Tineye,&nbsp;
+        IQDB,&nbsp;
+        SauceNAO,&nbsp;
+        IMGOPS
         {% elif 'video' in m.mimetype %}
-        [video element]
+        [video element inside a new wrapper]
         {% elif 'audio' in m.mimetype %}
-        [audio element]
+        [audio element inside a new wrapper]
         {% elif m.filename | get_extension == ".wacz" %}
         [replayweb embed]
         {% else %}
@@ -26,9 +45,9 @@
         No preview available for {{ m.key }}.
         {{ m.url | urlize }}
         {% endif %}
         {% if links %}
-        [link markup] open or download
+        [reworked link markup] open or download
         {% endif %}
     {% endfor %}
diff --git a/src/auto_archiver/storages/local.py b/src/auto_archiver/storages/local.py
index f5768a9..f4fb6bc 100644
--- a/src/auto_archiver/storages/local.py
+++ b/src/auto_archiver/storages/local.py
@@ -20,13 +20,15 @@ class LocalStorage(Storage):
     @staticmethod
     def configs() -> dict:
-        return {
-            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
-            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
-            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
-        }
+        return dict(
+            Storage.configs(),
+            **{
+                "save_to": {"default": "./archived", "help": "folder where to save archived content"},
+                "save_absolute": {"default": False, "help": "whether the stored file path is absolute or relative in the output, incl. formatters (WARNING: absolute paths leak the local file structure)"},
+            })
 
     def get_cdn_url(self, media: Media) -> str:
+        # TODO: is this viable with Storage.configs on path/filename?
         dest = os.path.join(self.save_to, media.key)
         if self.save_absolute:
             dest = os.path.abspath(dest)
@@ -34,14 +36,12 @@ class LocalStorage(Storage):
     def upload(self, media: Media, **kwargs) -> bool:
         # override parent so that we can use shutil.copy2 and keep metadata
-        if self.flatten:
-            dest = os.path.join(self.save_to, slugify(media.key))
-        else:
-            dest = os.path.join(self.save_to, media.key)
-
-        os.makedirs(dest, exist_ok=True)
+        dest = os.path.join(self.save_to, media.key)
+        os.makedirs(os.path.dirname(dest), exist_ok=True)
         logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
-        shutil.copy2(media.filename, dest)
+        res = shutil.copy2(media.filename, dest)
+        logger.info(res)
         return True
 
+    # must be implemented even if unused
     def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
         pass
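Both storage backends now inherit the shared options by merging `Storage.configs()` into their own map. A minimal sketch of the `dict(parent, **child)` pattern, with illustrative stand-in entries rather than the real config schemas:

```python
# dict(parent, **child) copies the parent mapping, then lays the child's
# entries on top; on a key collision the child's value wins.
parent = {"path_generator": {"default": "url"}, "filename_generator": {"default": "random"}}
child = {"save_to": {"default": "./archived"}}

merged = dict(parent, **child)
assert set(merged) == {"path_generator", "filename_generator", "save_to"}
```

Because child entries win on collisions, a backend could still shadow a shared default (e.g. ship a different `path_generator` default) without touching the base class.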
diff --git a/src/auto_archiver/storages/s3.py b/src/auto_archiver/storages/s3.py
index a88e2b2..f46ba43 100644
--- a/src/auto_archiver/storages/s3.py
+++ b/src/auto_archiver/storages/s3.py
@@ -24,23 +24,25 @@ class S3Storage(Storage):
     @staticmethod
     def configs() -> dict:
-        return {
-            "bucket": {"default": None, "help": "S3 bucket name"},
-            "region": {"default": None, "help": "S3 region name"},
-            "key": {"default": None, "help": "S3 API key"},
-            "secret": {"default": None, "help": "S3 API secret"},
-            # TODO: how to have sth like a custom folder? has to come from the feeders
-            "endpoint_url": {
-                "default": 'https://{region}.digitaloceanspaces.com',
-                "help": "S3 bucket endpoint, {region} are inserted at runtime"
-            },
-            "cdn_url": {
-                "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
-                "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
-            },
-            "private": {"default": False, "help": "if true S3 files will not be readable online"},
-            # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
-        }
+        return dict(
+            Storage.configs(),
+            **{
+                "bucket": {"default": None, "help": "S3 bucket name"},
+                "region": {"default": None, "help": "S3 region name"},
+                "key": {"default": None, "help": "S3 API key"},
+                "secret": {"default": None, "help": "S3 API secret"},
+                # TODO: how to have something like a custom folder? has to come from the feeders
+                "endpoint_url": {
+                    "default": 'https://{region}.digitaloceanspaces.com',
+                    "help": "S3 bucket endpoint, {region} is inserted at runtime"
+                },
+                "cdn_url": {
+                    "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
+                    "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
+                },
+                "private": {"default": False, "help": "if true S3 files will not be readable online"},
+                # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
+            })
 
     def get_cdn_url(self, media: Media) -> str:
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
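`get_cdn_url` is plain string templating on the configured `cdn_url`. A worked example with hypothetical bucket, region, and key values (none of these appear in the diff):

```python
# The default cdn_url template from S3Storage.configs(), expanded the same way
# get_cdn_url() does it.
cdn_url = 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}'
print(cdn_url.format(bucket="my-archive", region="fra1",
                     key="my-sheet/sheet1/3f2a9c.png"))
# -> https://my-archive.fra1.cdn.digitaloceanspaces.com/my-sheet/sheet1/3f2a9c.png
```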
diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py
index bfd22f7..1bc301b 100644
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
+import hashlib
 from typing import IO, Any
 
 from ..core import Media, Metadata, Step
@@ -12,13 +13,32 @@ from slugify import slugify
 @dataclass
 class Storage(Step):
     name = "storage"
+    PATH_GENERATOR_OPTIONS = ["flat", "url", "random"]
+    FILENAME_GENERATOR_CHOICES = ["random", "static"]
 
     def __init__(self, config: dict) -> None:
         # without this STEP.__init__ is not called
         super().__init__(config)
+        assert self.path_generator in Storage.PATH_GENERATOR_OPTIONS, f"path_generator must be one of {Storage.PATH_GENERATOR_OPTIONS}"
+        assert self.filename_generator in Storage.FILENAME_GENERATOR_CHOICES, f"filename_generator must be one of {Storage.FILENAME_GENERATOR_CHOICES}"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "path_generator": {
+                "default": "url",
+                "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+                "choices": Storage.PATH_GENERATOR_OPTIONS
+            },
+            "filename_generator": {
+                "default": "random",
+                "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+                "choices": Storage.FILENAME_GENERATOR_CHOICES
+            }
+        }
 
-    # only for typing...
     def init(name: str, config: dict) -> Storage:
+        # only for typing...
         return Step.init(name, config, Storage)
 
     def store(self, media: Media, item: Metadata) -> None:
@@ -38,10 +58,24 @@ class Storage(Step):
         return self.uploadf(f, media, **kwargs)
 
     def set_key(self, media: Media, item: Metadata) -> None:
-        #TODO: accept options to make these predictable or random
         """takes the media and optionally item info and generates a key"""
         if media.key is not None and len(media.key) > 0: return
         folder = item.get("folder", "")
-        ext = os.path.splitext(media.filename)[1]
-        # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
-        media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
+        filename, ext = os.path.splitext(media.filename)
+
+        # path_generator logic
+        if self.path_generator == "flat":
+            path = ""
+            filename = slugify(filename)  # in case it comes with os.sep
+        elif self.path_generator == "url":
+            path = slugify(item.get_url())
+        elif self.path_generator == "random":
+            path = item.get("random_path", str(uuid.uuid4())[:16], True)
+
+        # filename_generator logic
+        if self.filename_generator == "random":
+            filename = str(uuid.uuid4())[:16]
+        elif self.filename_generator == "static":
+            with open(media.filename, "rb") as f:
+                filename = hashlib.sha256(f.read()).hexdigest()[:24]  # hash the whole file
+
+        media.key = os.path.join(folder, path, f"{filename}{ext}")
\ No newline at end of file
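Taken together, `set_key` now composes keys as `folder/path/filename.ext`. A self-contained sketch of the shapes the new generators produce, using made-up inputs; the real method reads these from the `Metadata` item and `Media` object:

```python
import hashlib
import os
import uuid

from slugify import slugify

# Illustrative inputs only.
folder = "my-sheet/sheet1"                    # e.g. set by the gsheet feeder
url = "https://example.com/post/1"
file_bytes = b"...archived file contents..."  # stands in for the file on disk
ext = ".mp4"

# path_generator: "flat" -> "", "url" -> slug of the URL, "random" -> short uuid
path = slugify(url)                           # "https-example-com-post-1"
random_path = str(uuid.uuid4())[:16]

# filename_generator: "random" -> short uuid, "static" -> content hash
static_name = hashlib.sha256(file_bytes).hexdigest()[:24]

print(os.path.join(folder, path, f"{static_name}{ext}"))
# -> my-sheet/sheet1/https-example-com-post-1/<24 hex chars>.mp4
```

The "static" filename is content-addressed, so re-archiving identical bytes should yield the same key and overwrite rather than duplicate the stored file, whereas "random" produces a fresh key on every run.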