Mirror of https://github.com/bellingcat/auto-archiver

minor improvements

parent 092ffdb6d8
commit 9dd8afed8c
```diff
@@ -11,18 +11,19 @@ from dateutil.parser import parse as parse_dt
 from .media import Media

 # annotation order matters
 @dataclass_json
 @dataclass
 class Metadata:
     status: str = "no archiver"
-    _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+    _processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
     metadata: Dict[str, Any] = field(default_factory=dict)
-    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True})  # keys that are not to be saved in DBs
+    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True})  # keys that are not to be saved in DBs
     media: List[Media] = field(default_factory=list)
     final_media: Media = None  # can be overwritten by formatters
     rearchivable: bool = False

     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """
         merges two Metadata instances, will overwrite according to overwrite_left flag
```
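The `_processed_at` change fixes a classic dataclass pitfall: a plain default is evaluated once, at class-definition time, so every `Metadata` instance would share the timestamp of whenever the module was imported. `field(default_factory=...)` defers evaluation to construction. A minimal sketch of the difference (class names are hypothetical):

```python
from dataclasses import dataclass, field
import datetime
import time

@dataclass
class Frozen:
    # evaluated once when the class is defined: shared by all instances
    ts: datetime.datetime = datetime.datetime.utcnow()

@dataclass
class Fresh:
    # default_factory is called per instance, at construction time
    ts: datetime.datetime = field(default_factory=datetime.datetime.utcnow)

a, _, b = Frozen(), time.sleep(0.01), Frozen()
assert a.ts == b.ts   # same frozen timestamp
c, _, d = Fresh(), time.sleep(0.01), Fresh()
assert c.ts != d.ts   # fresh per instance
```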
```diff
@@ -33,6 +33,10 @@ class GsheetsFeeder(Gsheets, Feeder):
                 "default": set(),
                 "help": "(CSV) explicitly block some worksheets from being processed",
                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
             },
+            "use_sheet_names_in_stored_paths":{
+                "default": True,
+                "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
+            }
         })
@@ -60,7 +64,10 @@ class GsheetsFeeder(Gsheets, Feeder):
                 if status not in ['', None]: continue

                 # All checks done - archival process starts here
-                yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True).set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+                if self.use_sheet_names_in_stored_paths:
+                    m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                yield m

         logger.success(f'Finished worksheet {wks.title}')
```
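With `use_sheet_names_in_stored_paths` enabled (the default), the feeder sets each item's `folder` to the slugified workbook and worksheet names, so archived files land under `workbook_name/worksheet_name/...`. A quick illustration (the sheet names are made up):

```python
import os
from slugify import slugify  # same dependency the feeder uses

sheet_title, worksheet_title = "Incident Tracker Q3", "Week 12"  # hypothetical names
print(os.path.join(slugify(sheet_title), slugify(worksheet_title)))
# -> incident-tracker-q3/week-12
```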
```diff
@@ -5,18 +5,37 @@
 No URL available for {{ m.key }}.
 {% elif 'http' in url %}
 {% if 'image' in m.mimetype %}
-<a href="{{ url }}">
-    <img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
-</a>
+<div>
+    <a href="{{ url }}">
+        <img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
+    </a>
+
+    <div>
+        Reverse Image Search:
+        <a href="https://www.google.com/searchbyimage?sbisrc=4chanx&image_url={{ url | quote }}&safe=off">Google</a>,
+        <a href="https://lens.google.com/uploadbyurl?url={{ url | quote }}">Google Lens</a>,
+        <a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,
+        <a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,
+        <a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>,
+        <a href="https://iqdb.org/?url={{ url | quote }}">IQDB</a>,
+        <a href="https://saucenao.com/search.php?db=999&url={{ url | quote }}">SauceNAO</a>,
+        <a href="https://imgops.com/{{ url | quote }}">IMGOPS</a>
+    </div>
+    <p></p>
+</div>
 {% elif 'video' in m.mimetype %}
-<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
-    Your browser does not support the video element.
-</video>
+<div>
+    <video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
+        Your browser does not support the video element.
+    </video>
+</div>
 {% elif 'audio' in m.mimetype %}
-<audio controls>
-    <source src="{{ url }}" type="{{ m.mimetype }}">
-    Your browser does not support the audio element.
-</audio>
+<div>
+    <audio controls>
+        <source src="{{ url }}" type="{{ m.mimetype }}">
+        Your browser does not support the audio element.
+    </audio>
+</div>
 {% elif m.filename | get_extension == ".wacz" %}
 <a href="https://replayweb.page/?source={{ url | quote }}#view=pages&url={{ main_url }}">replayweb</a>
 {% else %}
@@ -26,9 +45,9 @@ No preview available for {{ m.key }}.
 {{ m.url | urlize }}
 {% endif %}
 {% if links %}
 <br>
 <a href="{{ url }}">open</a> or
 <a href="{{ url }}" download="">download</a>
 <br>
 {% endif %}
 {% endfor %}
```
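The reverse-image-search links all follow one pattern: percent-encode the media URL (the template's `quote` filter) and splice it into each engine's query string. A small Python sketch of the same construction, assuming `urllib.parse.quote` behaves like the filter:

```python
from urllib.parse import quote

def reverse_search_links(url: str) -> dict[str, str]:
    """Build reverse-image-search URLs for a media URL (engines as in the template)."""
    q = quote(url, safe="")  # percent-encode the full URL, including '/' and ':'
    return {
        "Google Lens": f"https://lens.google.com/uploadbyurl?url={q}",
        "Yandex": f"https://yandex.ru/images/touch/search?rpt=imageview&url={q}",
        "Tineye": f"https://www.tineye.com/search/?url={q}",
    }

print(reverse_search_links("https://example.com/img.jpg")["Tineye"])
```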
```diff
@@ -20,13 +20,15 @@ class LocalStorage(Storage):

     @staticmethod
     def configs() -> dict:
-        return {
-            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
-            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
-            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
-        }
+        return dict(
+            Storage.configs(),
+            ** {
+                "save_to": {"default": "./archived", "help": "folder where to save archived content"},
+                "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
+            })

     def get_cdn_url(self, media: Media) -> str:
+        #TODO: is this viable with Storage.configs on path/filename?
         dest = os.path.join(self.save_to, media.key)
         if self.save_absolute:
             dest = os.path.abspath(dest)
@@ -34,14 +36,12 @@ class LocalStorage(Storage):

     def upload(self, media: Media, **kwargs) -> bool:
         # override parent so that we can use shutil.copy2 and keep metadata
-        if self.flatten:
-            dest = os.path.join(self.save_to, slugify(media.key))
-        else:
-            dest = os.path.join(self.save_to, media.key)
-        os.makedirs(dest, exist_ok=True)
+        dest = os.path.join(self.save_to, media.key)
+        os.makedirs(os.path.dirname(dest), exist_ok=True)
         logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
-        shutil.copy2(media.filename, dest)
+        res = shutil.copy2(media.filename, dest)
+        logger.info(res)
         return True

     # must be implemented even if unused
     def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
```
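Both storages now layer their own options over the base class via `dict(Storage.configs(), **{...})`. A minimal sketch of that merge's semantics (the keys are illustrative):

```python
base = {"path_generator": {"default": "url"}, "filename_generator": {"default": "random"}}
extra = {"save_to": {"default": "./archived"}}

# dict(base, **extra) copies base, then applies extra on top;
# duplicate keys in extra would win, and base itself is left untouched
merged = dict(base, **extra)
assert set(merged) == {"path_generator", "filename_generator", "save_to"}
```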
```diff
@@ -24,23 +24,25 @@ class S3Storage(Storage):

     @staticmethod
     def configs() -> dict:
-        return {
-            "bucket": {"default": None, "help": "S3 bucket name"},
-            "region": {"default": None, "help": "S3 region name"},
-            "key": {"default": None, "help": "S3 API key"},
-            "secret": {"default": None, "help": "S3 API secret"},
-            # TODO: how to have sth like a custom folder? has to come from the feeders
-            "endpoint_url": {
-                "default": 'https://{region}.digitaloceanspaces.com',
-                "help": "S3 bucket endpoint, {region} are inserted at runtime"
-            },
-            "cdn_url": {
-                "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
-                "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
-            },
-            "private": {"default": False, "help": "if true S3 files will not be readable online"},
-            # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
-        }
+        return dict(
+            Storage.configs(),
+            ** {
+                "bucket": {"default": None, "help": "S3 bucket name"},
+                "region": {"default": None, "help": "S3 region name"},
+                "key": {"default": None, "help": "S3 API key"},
+                "secret": {"default": None, "help": "S3 API secret"},
+                # TODO: how to have sth like a custom folder? has to come from the feeders
+                "endpoint_url": {
+                    "default": 'https://{region}.digitaloceanspaces.com',
+                    "help": "S3 bucket endpoint, {region} are inserted at runtime"
+                },
+                "cdn_url": {
+                    "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
+                    "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
+                },
+                "private": {"default": False, "help": "if true S3 files will not be readable online"},
+                # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
+            })

     def get_cdn_url(self, media: Media) -> str:
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
```
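The `endpoint_url` and `cdn_url` options are plain `str.format` templates; `get_cdn_url` fills in bucket, region, and media key at runtime. For illustration, with made-up values:

```python
cdn_url = 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}'
# hypothetical bucket/region/key, for illustration only
print(cdn_url.format(bucket="my-archive", region="fra1", key="some-url/abc123.jpg"))
# -> https://my-archive.fra1.cdn.digitaloceanspaces.com/some-url/abc123.jpg
```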
```diff
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
+import hashlib
 from typing import IO, Any

 from ..core import Media, Metadata, Step
@@ -12,13 +13,32 @@ from slugify import slugify
 @dataclass
 class Storage(Step):
     name = "storage"
+    PATH_GENERATOR_OPTIONS = ["flat", "url", "random"]
+    FILENAME_GENERATOR_CHOICES = ["random", "static"]
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        assert self.path_generator in Storage.PATH_GENERATOR_OPTIONS, f"path_generator must be one of {Storage.PATH_GENERATOR_OPTIONS}"
+        assert self.filename_generator in Storage.FILENAME_GENERATOR_CHOICES, f"filename_generator must be one of {Storage.FILENAME_GENERATOR_CHOICES}"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "path_generator": {
+                "default": "url",
+                "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+                "choices": Storage.PATH_GENERATOR_OPTIONS
+            },
+            "filename_generator": {
+                "default": "random",
+                "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+                "choices": Storage.FILENAME_GENERATOR_CHOICES
+            }
+        }

-    # only for typing...
     def init(name: str, config: dict) -> Storage:
+        # only for typing...
         return Step.init(name, config, Storage)

     def store(self, media: Media, item: Metadata) -> None:
```
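The new `__init__` enforces the `choices` declared in `configs()` with assertions, so a misconfigured storage fails at construction rather than at upload time. A stripped-down sketch of the pattern (in the real code the attribute wiring happens in `Step.__init__`, elided here):

```python
class StorageSketch:
    PATH_GENERATOR_OPTIONS = ["flat", "url", "random"]

    def __init__(self, config: dict) -> None:
        # the real Step.__init__ turns config entries into attributes; simplified here
        self.path_generator = config.get("path_generator", "url")
        assert self.path_generator in self.PATH_GENERATOR_OPTIONS, \
            f"path_generator must be one of {self.PATH_GENERATOR_OPTIONS}"

StorageSketch({"path_generator": "flat"})    # ok
# StorageSketch({"path_generator": "zip"})   # AssertionError
```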
```diff
@@ -38,10 +58,24 @@ class Storage(Step):
         return self.uploadf(f, media, **kwargs)

     def set_key(self, media: Media, item: Metadata) -> None:
-        #TODO: accept options to make these predictable or random
         """takes the media and optionally item info and generates a key"""
         if media.key is not None and len(media.key) > 0: return
         folder = item.get("folder", "")
-        ext = os.path.splitext(media.filename)[1]
-        # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
-        media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
+        filename, ext = os.path.splitext(media.filename)
+
+        # path_generator logic
+        if self.path_generator == "flat":
+            path = ""
+            filename = slugify(filename)  # in case it comes with os.sep
+        elif self.path_generator == "url": path = slugify(item.get_url())
+        elif self.path_generator == "random":
+            path = item.get("random_path", str(uuid.uuid4())[:16], True)
+
+        # filename_generator logic
+        if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
+        elif self.filename_generator == "static":
+            with open(media.filename, "rb") as f:
+                bytes = f.read()  # read entire file as bytes
+                filename = hashlib.sha256(bytes).hexdigest()[:24]
+
+        media.key = os.path.join(folder, path, f"{filename}{ext}")
```
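Putting the two knobs together, `set_key` produces `folder/<path>/<filename><ext>`: the directory comes from `path_generator` and the basename from `filename_generator` ('static' hashes the file contents, so re-archiving the same bytes yields the same name). A self-contained sketch of the same logic outside the class (sample values hypothetical):

```python
import hashlib
import os
import uuid

from slugify import slugify  # same dependency the project uses

def make_key(folder: str, url: str, local_file: str,
             path_generator: str = "url", filename_generator: str = "random") -> str:
    filename, ext = os.path.splitext(os.path.basename(local_file))

    if path_generator == "flat":
        path = ""                      # everything at the root of the storage
    elif path_generator == "url":
        path = slugify(url)            # one directory per archived URL
    else:                              # "random"
        path = str(uuid.uuid4())[:16]

    if filename_generator == "random":
        filename = str(uuid.uuid4())[:16]
    else:                              # "static": replicable hash of the contents
        with open(local_file, "rb") as f:
            filename = hashlib.sha256(f.read()).hexdigest()[:24]

    return os.path.join(folder, path, f"{filename}{ext}")
```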