diff --git a/src/auto_archiver/core/metadata.py b/src/auto_archiver/core/metadata.py
index 4707a2f..dfbd2ec 100644
--- a/src/auto_archiver/core/metadata.py
+++ b/src/auto_archiver/core/metadata.py
@@ -11,18 +11,19 @@ from dateutil.parser import parse as parse_dt
from .media import Media
# annotation order matters
+
+
@dataclass_json
@dataclass
class Metadata:
status: str = "no archiver"
- _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+ _processed_at: datetime = field(default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))
metadata: Dict[str, Any] = field(default_factory=dict)
- tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True}) # keys that are not to be saved in DBs
+ tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list)
final_media: Media = None # can be overwritten by formatters
rearchivable: bool = False
-
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
merges two Metadata instances, will overwrite according to overwrite_left flag
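Context for the `_processed_at` change: a plain dataclass default is evaluated once at import time, so every `Metadata` instance would share the same frozen timestamp; `default_factory` runs on each instantiation (the factory above also keeps the timestamp timezone-aware, which a bare `utcnow` would drop). A minimal standalone sketch, not the project's code:

```python
import datetime
import time
from dataclasses import dataclass, field

@dataclass
class Shared:
    # evaluated once at class definition: every instance gets this exact value
    ts: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)

@dataclass
class PerInstance:
    # evaluated per instantiation: a fresh, timezone-aware timestamp each time
    ts: datetime.datetime = field(
        default_factory=lambda: datetime.datetime.now(datetime.timezone.utc))

a = Shared(); time.sleep(0.01); b = Shared()
assert a.ts == b.ts   # identical: the default was computed only once
c = PerInstance(); time.sleep(0.01); d = PerInstance()
assert c.ts != d.ts   # distinct: the factory ran per instance
```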
diff --git a/src/auto_archiver/feeders/gsheet_feeder.py b/src/auto_archiver/feeders/gsheet_feeder.py
index 19b1fbc..c73015f 100644
--- a/src/auto_archiver/feeders/gsheet_feeder.py
+++ b/src/auto_archiver/feeders/gsheet_feeder.py
@@ -33,6 +33,10 @@ class GsheetsFeeder(Gsheets, Feeder):
"default": set(),
"help": "(CSV) explicitly block some worksheets from being processed",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+ },
+ "use_sheet_names_in_stored_paths":{
+ "default": True,
+ "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
}
})
@@ -60,7 +64,10 @@ class GsheetsFeeder(Gsheets, Feeder):
if status not in ['', None]: continue
# All checks done - archival process starts here
- yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True).set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+ m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+ if self.use_sheet_names_in_stored_paths:
+ m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+ yield m
logger.success(f'Finished worksheet {wks.title}')
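The `folder` value set above is what storages later prepend to each media key. A quick sketch of the prefix it produces (workbook and worksheet names are made up):

```python
import os
from slugify import slugify

sheet, worksheet = "CIR Core Monitoring", "Sheet 1!"
folder = os.path.join(slugify(sheet), slugify(worksheet))
print(folder)  # cir-core-monitoring/sheet-1
```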
diff --git a/src/auto_archiver/formatters/templates/macros.html b/src/auto_archiver/formatters/templates/macros.html
index 6faf891..658fd40 100644
--- a/src/auto_archiver/formatters/templates/macros.html
+++ b/src/auto_archiver/formatters/templates/macros.html
@@ -5,18 +5,37 @@
No URL available for {{ m.key }}.
{% elif 'http' in url %}
{% if 'image' in m.mimetype %}
- <a href="{{ url }}" target="_blank">
- <img src="{{ url }}"/>
- </a>
+ <img src="{{ url }}"/>
{% elif 'video' in m.mimetype %}
- <video src="{{ url }}" controls/>
+ <video controls>
+ <source src="{{ url }}">
+ </video>
{% elif 'audio' in m.mimetype %}
- <audio src="{{ url }}" controls/>
+ <audio controls>
+ <source src="{{ url }}">
+ </audio>
{% elif m.filename | get_extension == ".wacz" %}
<a href="https://replayweb.page/?source={{ url }}" target="_blank">replayweb</a>
{% else %}
@@ -26,9 +45,9 @@ No preview available for {{ m.key }}.
{{ m.url | urlize }}
{% endif %}
{% if links %}
- <br/>
<a href="{{ url }}" target="_blank">open</a> or
<a href="{{ url }}" download>download</a>
+ <br/>
{% endif %}
{% endfor %}
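The macro dispatches on `m.mimetype` to pick an HTML element. A rough standalone Jinja2 illustration of that dispatch (markup simplified and assumed, not the macro's exact attributes):

```python
from jinja2 import Template

tpl = Template(
    "{% if 'image' in mimetype %}<img src='{{ url }}'/>"
    "{% elif 'video' in mimetype %}<video controls><source src='{{ url }}'></video>"
    "{% elif 'audio' in mimetype %}<audio controls><source src='{{ url }}'></audio>"
    "{% else %}{{ url }}{% endif %}")

print(tpl.render(mimetype="video/mp4", url="https://example.com/clip.mp4"))
# <video controls><source src='https://example.com/clip.mp4'></video>
```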
diff --git a/src/auto_archiver/storages/local.py b/src/auto_archiver/storages/local.py
index f5768a9..f4fb6bc 100644
--- a/src/auto_archiver/storages/local.py
+++ b/src/auto_archiver/storages/local.py
@@ -20,13 +20,15 @@ class LocalStorage(Storage):
@staticmethod
def configs() -> dict:
- return {
- "save_to": {"default": "./archived", "help": "folder where to save archived content"},
- "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
- "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
- }
+ return dict(
+ Storage.configs(),
+ **{
+ "save_to": {"default": "./archived", "help": "folder where to save archived content"},
+ "save_absolute": {"default": False, "help": "whether the stored file path is absolute or relative in the output, incl. formatters (WARNING: absolute paths leak the local file structure)"},
+ })
def get_cdn_url(self, media: Media) -> str:
+ # TODO: is this viable with Storage.configs on path/filename?
dest = os.path.join(self.save_to, media.key)
if self.save_absolute:
dest = os.path.abspath(dest)
@@ -34,14 +36,12 @@ class LocalStorage(Storage):
def upload(self, media: Media, **kwargs) -> bool:
# override parent so that we can use shutil.copy2 and keep metadata
- if self.flatten:
- dest = os.path.join(self.save_to, slugify(media.key))
- else:
- dest = os.path.join(self.save_to, media.key)
-
- os.makedirs(dest, exist_ok=True)
+ dest = os.path.join(self.save_to, media.key)
+ os.makedirs(os.path.dirname(dest), exist_ok=True)
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
- shutil.copy2(media.filename, dest)
+ res = shutil.copy2(media.filename, dest)
+ logger.info(f"copied file to {res}")
return True
+ # must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
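Why `copy2` plus `os.path.dirname`: `shutil.copy2` preserves file metadata (timestamps, permissions) where `shutil.copy` does not, and only the destination's parent directory must exist beforehand, not the destination itself. A small sketch with illustrative paths:

```python
import os
import shutil

src = "/tmp/example-src.txt"
dest = "/tmp/archived/some/nested/key.txt"
with open(src, "w") as f:
    f.write("example")

os.makedirs(os.path.dirname(dest), exist_ok=True)  # create parents, not dest itself
res = shutil.copy2(src, dest)                      # returns the destination path
assert res == dest
assert os.path.getmtime(src) == os.path.getmtime(dest)  # mtime carried over by copy2
```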
diff --git a/src/auto_archiver/storages/s3.py b/src/auto_archiver/storages/s3.py
index a88e2b2..f46ba43 100644
--- a/src/auto_archiver/storages/s3.py
+++ b/src/auto_archiver/storages/s3.py
@@ -24,23 +24,25 @@ class S3Storage(Storage):
@staticmethod
def configs() -> dict:
- return {
- "bucket": {"default": None, "help": "S3 bucket name"},
- "region": {"default": None, "help": "S3 region name"},
- "key": {"default": None, "help": "S3 API key"},
- "secret": {"default": None, "help": "S3 API secret"},
- # TODO: how to have sth like a custom folder? has to come from the feeders
- "endpoint_url": {
- "default": 'https://{region}.digitaloceanspaces.com',
- "help": "S3 bucket endpoint, {region} are inserted at runtime"
- },
- "cdn_url": {
- "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
- "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
- },
- "private": {"default": False, "help": "if true S3 files will not be readable online"},
- # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
- }
+ return dict(
+ Storage.configs(),
+ **{
+ "bucket": {"default": None, "help": "S3 bucket name"},
+ "region": {"default": None, "help": "S3 region name"},
+ "key": {"default": None, "help": "S3 API key"},
+ "secret": {"default": None, "help": "S3 API secret"},
+ # TODO: how to have sth like a custom folder? has to come from the feeders
+ "endpoint_url": {
+ "default": 'https://{region}.digitaloceanspaces.com',
+ "help": "S3 bucket endpoint, {region} are inserted at runtime"
+ },
+ "cdn_url": {
+ "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
+ "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
+ },
+ "private": {"default": False, "help": "if true S3 files will not be readable online"},
+ # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
+ })
def get_cdn_url(self, media: Media) -> str:
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
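The `dict(Storage.configs(), **{...})` idiom used in both storages merges the parent's options under the subclass's, with subclass keys winning on conflict. In isolation:

```python
parent = {"path_generator": {"default": "url"},
          "filename_generator": {"default": "random"}}
child = dict(parent, **{"bucket": {"default": None},
                        "path_generator": {"default": "flat"}})  # override wins

assert set(child) == {"path_generator", "filename_generator", "bucket"}
assert child["path_generator"]["default"] == "flat"
```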
diff --git a/src/auto_archiver/storages/storage.py b/src/auto_archiver/storages/storage.py
index bfd22f7..1bc301b 100644
--- a/src/auto_archiver/storages/storage.py
+++ b/src/auto_archiver/storages/storage.py
@@ -1,6 +1,7 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
+import hashlib
from typing import IO, Any
from ..core import Media, Metadata, Step
@@ -12,13 +13,32 @@ from slugify import slugify
@dataclass
class Storage(Step):
name = "storage"
+ PATH_GENERATOR_CHOICES = ["flat", "url", "random"]
+ FILENAME_GENERATOR_CHOICES = ["random", "static"]
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
+ assert self.path_generator in Storage.PATH_GENERATOR_CHOICES, f"path_generator must be one of {Storage.PATH_GENERATOR_CHOICES}"
+ assert self.filename_generator in Storage.FILENAME_GENERATOR_CHOICES, f"filename_generator must be one of {Storage.FILENAME_GENERATOR_CHOICES}"
+
+ @staticmethod
+ def configs() -> dict:
+ return {
+ "path_generator": {
+ "default": "url",
+ "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+ "choices": Storage.PATH_GENERATOR_OPTIONS
+ },
+ "filename_generator": {
+ "default": "random",
+ "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+ "choices": Storage.FILENAME_GENERATOR_CHOICES
+ }
+ }
- # only for typing...
def init(name: str, config: dict) -> Storage:
+ # only for typing...
return Step.init(name, config, Storage)
def store(self, media: Media, item: Metadata) -> None:
@@ -38,10 +58,24 @@ class Storage(Step):
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, item: Metadata) -> None:
- #TODO: accept options to make these predictable or random
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = item.get("folder", "")
- ext = os.path.splitext(media.filename)[1]
- # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
- media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
+ filename, ext = os.path.splitext(media.filename)
+
+ # path_generator logic
+ if self.path_generator == "flat":
+ path = ""
+ filename = slugify(filename) # in case it comes with os.sep
+ elif self.path_generator == "url": path = slugify(item.get_url())
+ elif self.path_generator == "random":
+ # third argument (create_if_missing=True) stores the value, so all media in this item share one random path
+ path = item.get("random_path", str(uuid.uuid4())[:16], True)
+
+ # filename_generator logic
+ if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
+ elif self.filename_generator == "static":
+ # hash in chunks to avoid shadowing the bytes builtin and reading the whole file into memory
+ sha256 = hashlib.sha256()
+ with open(media.filename, "rb") as f:
+ for block in iter(lambda: f.read(8192), b""):
+ sha256.update(block)
+ filename = sha256.hexdigest()[:24]
+
+ media.key = os.path.join(folder, path, f"{filename}{ext}")
\ No newline at end of file
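Putting the two generators together, the resulting keys look roughly like this (the folder comes from the feeder; uuid and hash values are invented for readability):

```python
import os

folder = "cir-core-monitoring/sheet-1"  # e.g. set by the gsheet feeder
samples = {
    ("flat", "random"): os.path.join(folder, "", "1a2b3c4d5e6f7081.jpg"),
    ("url", "random"): os.path.join(folder, "https-example-com-post-1", "1a2b3c4d5e6f7081.jpg"),
    ("url", "static"): os.path.join(folder, "https-example-com-post-1", "9f86d081884c7d659a2feaa0.jpg"),
    ("random", "static"): os.path.join(folder, "0f8e7d6c5b4a3921", "9f86d081884c7d659a2feaa0.jpg"),
}
for (path_gen, name_gen), key in samples.items():
    print(f"{path_gen:6} + {name_gen:6} -> {key}")
```

The 'static' names are replicable: re-archiving identical bytes yields the same sha256-derived filename, while 'random' produces a new uuid each time.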