minor improvements

pull/72/head
msramalho 2023-01-22 23:15:54 +00:00
rodzic 092ffdb6d8
commit 9dd8afed8c
6 zmienionych plików z 112 dodań i 49 usunięć

Wyświetl plik

@@ -11,18 +11,19 @@ from dateutil.parser import parse as parse_dt
from .media import Media
# annotation order matters
@dataclass_json
@dataclass
class Metadata:
status: str = "no archiver"
_processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
_processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
metadata: Dict[str, Any] = field(default_factory=dict)
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True}) # keys that are not to be saved in DBs
tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True}) # keys that are not to be saved in DBs
media: List[Media] = field(default_factory=list)
final_media: Media = None # can be overwritten by formatters
rearchivable: bool = False
def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
"""
merges two Metadata instances, will overwrite according to overwrite_left flag

Wyświetl plik

@@ -33,6 +33,10 @@ class GsheetsFeeder(Gsheets, Feeder):
"default": set(),
"help": "(CSV) explicitly block some worksheets from being processed",
"cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
},
"use_sheet_names_in_stored_paths":{
"default": True,
"help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
}
})
@@ -60,7 +64,10 @@ class GsheetsFeeder(Gsheets, Feeder):
if status not in ['', None]: continue
# All checks done - archival process starts here
yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True).set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
if self.use_sheet_names_in_stored_paths:
m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
yield m
logger.success(f'Finished worksheet {wks.title}')

Wyświetl plik

@@ -5,18 +5,37 @@
No URL available for {{ m.key }}.
{% elif 'http' in url %}
{% if 'image' in m.mimetype %}
<a href="{{ url }}">
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
</a>
<div>
<a href="{{ url }}">
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
</a>
<div>
Reverse Image Search:&nbsp;
<a href="https://www.google.com/searchbyimage?sbisrc=4chanx&image_url={{ url | quote }}&safe=off">Google</a>,&nbsp;
<a href="https://lens.google.com/uploadbyurl?url={{ url | quote }}">Google Lens</a>,&nbsp;
<a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,&nbsp;
<a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,&nbsp;
<a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>,&nbsp;
<a href="https://iqdb.org/?url={{ url | quote }}">IQDB</a>,&nbsp;
<a href="https://saucenao.com/search.php?db=999&url={{ url | quote }}">SauceNAO</a>,&nbsp;
<a href="https://imgops.com/{{ url | quote }}">IMGOPS</a>
</div>
<p></p>
</div>
{% elif 'video' in m.mimetype %}
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
Your browser does not support the video element.
</video>
<div>
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
Your browser does not support the video element.
</video>
</div>
{% elif 'audio' in m.mimetype %}
<audio controls>
<source src="{{ url }}" type="{{ m.mimetype }}">
Your browser does not support the audio element.
</audio>
<div>
<audio controls>
<source src="{{ url }}" type="{{ m.mimetype }}">
Your browser does not support the audio element.
</audio>
</div>
{% elif m.filename | get_extension == ".wacz" %}
<a href="https://replayweb.page/?source={{ url | quote }}#view=pages&url={{ main_url }}">replayweb</a>
{% else %}
@@ -26,9 +45,9 @@ No preview available for {{ m.key }}.
{{ m.url | urlize }}
{% endif %}
{% if links %}
<br>
<a href="{{ url }}">open</a> or
<a href="{{ url }}" download="">download</a>
<br>
{% endif %}
{% endfor %}

Wyświetl plik

@@ -20,13 +20,15 @@ class LocalStorage(Storage):
@staticmethod
def configs() -> dict:
return {
"save_to": {"default": "./archived", "help": "folder where to save archived content"},
"flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
}
return dict(
Storage.configs(),
** {
"save_to": {"default": "./archived", "help": "folder where to save archived content"},
"save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
})
def get_cdn_url(self, media: Media) -> str:
#TODO: is this viable with Storage.configs on path/filename?
dest = os.path.join(self.save_to, media.key)
if self.save_absolute:
dest = os.path.abspath(dest)
@@ -34,14 +36,12 @@ class LocalStorage(Storage):
def upload(self, media: Media, **kwargs) -> bool:
# override parent so that we can use shutil.copy2 and keep metadata
if self.flatten:
dest = os.path.join(self.save_to, slugify(media.key))
else:
dest = os.path.join(self.save_to, media.key)
os.makedirs(dest, exist_ok=True)
dest = os.path.join(self.save_to, media.key)
os.makedirs(os.path.dirname(dest), exist_ok=True)
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
shutil.copy2(media.filename, dest)
res = shutil.copy2(media.filename, dest)
logger.info(res)
return True
# must be implemented even if unused
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass

Wyświetl plik

@@ -24,23 +24,25 @@ class S3Storage(Storage):
@staticmethod
def configs() -> dict:
return {
"bucket": {"default": None, "help": "S3 bucket name"},
"region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"},
# TODO: how to have sth like a custom folder? has to come from the feeders
"endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime"
},
"cdn_url": {
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
},
"private": {"default": False, "help": "if true S3 files will not be readable online"},
# "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
}
return dict(
Storage.configs(),
** {
"bucket": {"default": None, "help": "S3 bucket name"},
"region": {"default": None, "help": "S3 region name"},
"key": {"default": None, "help": "S3 API key"},
"secret": {"default": None, "help": "S3 API secret"},
# TODO: how to have sth like a custom folder? has to come from the feeders
"endpoint_url": {
"default": 'https://{region}.digitaloceanspaces.com',
"help": "S3 bucket endpoint, {region} are inserted at runtime"
},
"cdn_url": {
"default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
"help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
},
"private": {"default": False, "help": "if true S3 files will not be readable online"},
# "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
})
def get_cdn_url(self, media: Media) -> str:
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)

Wyświetl plik

@@ -1,6 +1,7 @@
from __future__ import annotations
from abc import abstractmethod
from dataclasses import dataclass
import hashlib
from typing import IO, Any
from ..core import Media, Metadata, Step
@@ -12,13 +13,32 @@ from slugify import slugify
@dataclass
class Storage(Step):
name = "storage"
PATH_GENERATOR_OPTIONS = ["flat", "url", "random"]
FILENAME_GENERATOR_CHOICES = ["random", "static"]
def __init__(self, config: dict) -> None:
# without this STEP.__init__ is not called
super().__init__(config)
assert self.path_generator in Storage.PATH_GENERATOR_OPTIONS, f"path_generator must be one of {Storage.PATH_GENERATOR_OPTIONS}"
assert self.filename_generator in Storage.FILENAME_GENERATOR_CHOICES, f"filename_generator must be one of {Storage.FILENAME_GENERATOR_CHOICES}"
@staticmethod
def configs() -> dict:
return {
"path_generator": {
"default": "url",
"help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
"choices": Storage.PATH_GENERATOR_OPTIONS
},
"filename_generator": {
"default": "random",
"help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
"choices": Storage.FILENAME_GENERATOR_CHOICES
}
}
# only for typing...
def init(name: str, config: dict) -> Storage:
# only for typing...
return Step.init(name, config, Storage)
def store(self, media: Media, item: Metadata) -> None:
@@ -38,10 +58,24 @@ class Storage(Step):
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, item: Metadata) -> None:
#TODO: accept options to make these predictable or random
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = item.get("folder", "")
ext = os.path.splitext(media.filename)[1]
# media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
filename, ext = os.path.splitext(media.filename)
# path_generator logic
if self.path_generator == "flat":
path = ""
filename = slugify(filename) # in case it comes with os.sep
elif self.path_generator == "url": path = slugify(item.get_url())
elif self.path_generator == "random":
path = item.get("random_path", str(uuid.uuid4())[:16], True)
# filename_generator logic
if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
elif self.filename_generator == "static":
with open(media.filename, "rb") as f:
bytes = f.read() # read entire file as bytes
filename = hashlib.sha256(bytes).hexdigest()[:24]
media.key = os.path.join(folder, path, f"{filename}{ext}")