minor improvements

pull/72/head
msramalho 2023-01-22 23:15:54 +00:00
parent 092ffdb6d8
commit 9dd8afed8c
6 changed files with 112 additions and 49 deletions

View file

@@ -11,18 +11,19 @@ from dateutil.parser import parse as parse_dt
 from .media import Media

 # annotation order matters
 @dataclass_json
 @dataclass
 class Metadata:
     status: str = "no archiver"
-    _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+    _processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
     metadata: Dict[str, Any] = field(default_factory=dict)
-    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True}) # keys that are not to be saved in DBs
+    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
     media: List[Media] = field(default_factory=list)
     final_media: Media = None # can be overwritten by formatters
     rearchivable: bool = False

     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """
         merges two Metadata instances, will overwrite according to overwrite_left flag

View file

@@ -33,6 +33,10 @@ class GsheetsFeeder(Gsheets, Feeder):
                 "default": set(),
                 "help": "(CSV) explicitly block some worksheets from being processed",
                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
+            },
+            "use_sheet_names_in_stored_paths":{
+                "default": True,
+                "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
             }
         })
@@ -60,7 +64,10 @@ class GsheetsFeeder(Gsheets, Feeder):
                 if status not in ['', None]: continue

                 # All checks done - archival process starts here
-                yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True).set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+                if self.use_sheet_names_in_stored_paths:
+                    m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                yield m

             logger.success(f'Finished worksheet {wks.title}')
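For illustration only (not part of the commit): when use_sheet_names_in_stored_paths is enabled the feeder sets a "folder" made of the slugified workbook and worksheet names, so stored files end up under 'workbook_name/worksheet_name/...'; when it is disabled no folder is set and storages fall back to their own path logic. A minimal sketch of that branch (the helper name folder_for is hypothetical):

import os
from slugify import slugify

def folder_for(sheet_name: str, worksheet_title: str, use_sheet_names_in_stored_paths: bool) -> str:
    # e.g. "my-workbook/sheet1" when the option is on, "" (no prefix) when it is off
    if use_sheet_names_in_stored_paths:
        return os.path.join(slugify(sheet_name), slugify(worksheet_title))
    return ""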

View file

@@ -5,18 +5,37 @@
       No URL available for {{ m.key }}.
     {% elif 'http' in url %}
       {% if 'image' in m.mimetype %}
-        <a href="{{ url }}">
-          <img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
-        </a>
+        <div>
+          <a href="{{ url }}">
+            <img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
+          </a>
+          <div>
+            Reverse Image Search:&nbsp;
+            <a href="https://www.google.com/searchbyimage?sbisrc=4chanx&image_url={{ url | quote }}&safe=off">Google</a>,&nbsp;
+            <a href="https://lens.google.com/uploadbyurl?url={{ url | quote }}">Google Lens</a>,&nbsp;
+            <a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,&nbsp;
+            <a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,&nbsp;
+            <a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>,&nbsp;
+            <a href="https://iqdb.org/?url={{ url | quote }}">IQDB</a>,&nbsp;
+            <a href="https://saucenao.com/search.php?db=999&url={{ url | quote }}">SauceNAO</a>,&nbsp;
+            <a href="https://imgops.com/{{ url | quote }}">IMGOPS</a>
+          </div>
+          <p></p>
+        </div>
       {% elif 'video' in m.mimetype %}
-        <video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
-          Your browser does not support the video element.
-        </video>
+        <div>
+          <video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
+            Your browser does not support the video element.
+          </video>
+        </div>
       {% elif 'audio' in m.mimetype %}
-        <audio controls>
-          <source src="{{ url }}" type="{{ m.mimetype }}">
-          Your browser does not support the audio element.
-        </audio>
+        <div>
+          <audio controls>
+            <source src="{{ url }}" type="{{ m.mimetype }}">
+            Your browser does not support the audio element.
+          </audio>
+        </div>
       {% elif m.filename | get_extension == ".wacz" %}
        <a href="https://replayweb.page/?source={{ url | quote }}#view=pages&url={{ main_url }}">replayweb</a>
       {% else %}
@@ -26,9 +45,9 @@ No preview available for {{ m.key }}.
       {{ m.url | urlize }}
     {% endif %}
     {% if links %}
+      <br>
       <a href="{{ url }}">open</a> or
       <a href="{{ url }}" download="">download</a>
-      <br>
     {% endif %}
{% endfor %}

View file

@@ -20,13 +20,15 @@ class LocalStorage(Storage):
     @staticmethod
     def configs() -> dict:
-        return {
-            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
-            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
-            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
-        }
+        return dict(
+            Storage.configs(),
+            ** {
+                "save_to": {"default": "./archived", "help": "folder where to save archived content"},
+                "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
+            })

     def get_cdn_url(self, media: Media) -> str:
+        #TODO: is this viable with Storage.configs on path/filename?
         dest = os.path.join(self.save_to, media.key)
         if self.save_absolute:
             dest = os.path.abspath(dest)
@@ -34,14 +36,12 @@ class LocalStorage(Storage):
     def upload(self, media: Media, **kwargs) -> bool:
         # override parent so that we can use shutil.copy2 and keep metadata
-        if self.flatten:
-            dest = os.path.join(self.save_to, slugify(media.key))
-        else:
-            dest = os.path.join(self.save_to, media.key)
-        os.makedirs(dest, exist_ok=True)
+        dest = os.path.join(self.save_to, media.key)
+        os.makedirs(os.path.dirname(dest), exist_ok=True)
         logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
-        shutil.copy2(media.filename, dest)
+        res = shutil.copy2(media.filename, dest)
+        logger.info(res)
         return True

+    # must be implemented even if unused
     def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
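A note on the configs() change above: LocalStorage (and S3Storage below) now merge the parent Storage.configs() into their own options via dict(parent, **child), so every storage automatically exposes path_generator and filename_generator. A minimal sketch of the merge semantics, assuming plain dicts of option specs (values shortened here):

parent = {"path_generator": {"default": "url"}, "filename_generator": {"default": "random"}}
child_only = {"save_to": {"default": "./archived"}}

merged = dict(parent, **child_only)  # child keys win on any collision
assert set(merged) == {"path_generator", "filename_generator", "save_to"}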

View file

@@ -24,23 +24,25 @@ class S3Storage(Storage):
     @staticmethod
     def configs() -> dict:
-        return {
-            "bucket": {"default": None, "help": "S3 bucket name"},
-            "region": {"default": None, "help": "S3 region name"},
-            "key": {"default": None, "help": "S3 API key"},
-            "secret": {"default": None, "help": "S3 API secret"},
-            # TODO: how to have sth like a custom folder? has to come from the feeders
-            "endpoint_url": {
-                "default": 'https://{region}.digitaloceanspaces.com',
-                "help": "S3 bucket endpoint, {region} are inserted at runtime"
-            },
-            "cdn_url": {
-                "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
-                "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
-            },
-            "private": {"default": False, "help": "if true S3 files will not be readable online"},
-            # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
-        }
+        return dict(
+            Storage.configs(),
+            ** {
+                "bucket": {"default": None, "help": "S3 bucket name"},
+                "region": {"default": None, "help": "S3 region name"},
+                "key": {"default": None, "help": "S3 API key"},
+                "secret": {"default": None, "help": "S3 API secret"},
+                # TODO: how to have sth like a custom folder? has to come from the feeders
+                "endpoint_url": {
+                    "default": 'https://{region}.digitaloceanspaces.com',
+                    "help": "S3 bucket endpoint, {region} are inserted at runtime"
+                },
+                "cdn_url": {
+                    "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
+                    "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
+                },
+                "private": {"default": False, "help": "if true S3 files will not be readable online"},
+                # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
+            })

     def get_cdn_url(self, media: Media) -> str:
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
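The endpoint_url and cdn_url options are plain format strings; {region}, {bucket} and {key} are filled in at runtime with str.format, roughly like this (the values below are made up):

endpoint = 'https://{region}.digitaloceanspaces.com'.format(region="fra1")
cdn = 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}'.format(
    bucket="my-bucket", region="fra1", key="some-url/1a2b3c4d.jpg")
# -> https://my-bucket.fra1.cdn.digitaloceanspaces.com/some-url/1a2b3c4d.jpg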

View file

@@ -1,6 +1,7 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
+import hashlib
 from typing import IO, Any

 from ..core import Media, Metadata, Step
@@ -12,13 +13,32 @@ from slugify import slugify
 @dataclass
 class Storage(Step):
     name = "storage"
+    PATH_GENERATOR_OPTIONS = ["flat", "url", "random"]
+    FILENAME_GENERATOR_CHOICES = ["random", "static"]

     def __init__(self, config: dict) -> None:
         # without this STEP.__init__ is not called
         super().__init__(config)
+        assert self.path_generator in Storage.PATH_GENERATOR_OPTIONS, f"path_generator must be one of {Storage.PATH_GENERATOR_OPTIONS}"
+        assert self.filename_generator in Storage.FILENAME_GENERATOR_CHOICES, f"filename_generator must be one of {Storage.FILENAME_GENERATOR_CHOICES}"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "path_generator": {
+                "default": "url",
+                "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+                "choices": Storage.PATH_GENERATOR_OPTIONS
+            },
+            "filename_generator": {
+                "default": "random",
+                "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+                "choices": Storage.FILENAME_GENERATOR_CHOICES
+            }
+        }
+
+    # only for typing...
     def init(name: str, config: dict) -> Storage:
-        # only for typing...
         return Step.init(name, config, Storage)

     def store(self, media: Media, item: Metadata) -> None:
@@ -38,10 +58,24 @@ class Storage(Step):
             return self.uploadf(f, media, **kwargs)

     def set_key(self, media: Media, item: Metadata) -> None:
-        #TODO: accept options to make these predictable or random
         """takes the media and optionally item info and generates a key"""
         if media.key is not None and len(media.key) > 0: return
         folder = item.get("folder", "")
-        ext = os.path.splitext(media.filename)[1]
-        # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
-        media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
+        filename, ext = os.path.splitext(media.filename)
+
+        # path_generator logic
+        if self.path_generator == "flat":
+            path = ""
+            filename = slugify(filename) # in case it comes with os.sep
+        elif self.path_generator == "url": path = slugify(item.get_url())
+        elif self.path_generator == "random":
+            path = item.get("random_path", str(uuid.uuid4())[:16], True)
+
+        # filename_generator logic
+        if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
+        elif self.filename_generator == "static":
+            with open(media.filename, "rb") as f:
+                bytes = f.read() # read entire file as bytes
+            filename = hashlib.sha256(bytes).hexdigest()[:24]
+
+        media.key = os.path.join(folder, path, f"{filename}{ext}")
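For reference, a self-contained sketch of the key layout Storage.set_key now produces; the helper name and example values are illustrative, not part of the commit:

import hashlib
import os
import uuid
from slugify import slugify

def example_key(folder: str, url: str, local_file: str,
                path_generator: str = "url", filename_generator: str = "random") -> str:
    filename, ext = os.path.splitext(os.path.basename(local_file))
    # path_generator picks the directory placed under 'folder'
    if path_generator == "flat":
        path = ""
        filename = slugify(filename)  # avoid os.sep leaking into the key
    elif path_generator == "url":
        path = slugify(url)
    else:  # "random"
        path = str(uuid.uuid4())[:16]
    # filename_generator picks the file name itself
    if filename_generator == "random":
        filename = str(uuid.uuid4())[:16]
    elif filename_generator == "static":  # replicable sha256 prefix of the file contents
        with open(local_file, "rb") as f:
            filename = hashlib.sha256(f.read()).hexdigest()[:24]
    return os.path.join(folder, path, f"{filename}{ext}")

# e.g. example_key("workbook/sheet1", "https://example.com/post/1", "frame.jpg")
# -> "workbook/sheet1/https-example-com-post-1/<16-char-uuid>.jpg"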