Mirror of https://github.com/bellingcat/auto-archiver

minor improvements

parent 092ffdb6d8
commit 9dd8afed8c
```diff
@@ -11,18 +11,19 @@ from dateutil.parser import parse as parse_dt
 from .media import Media

 # annotation order matters
 @dataclass_json
 @dataclass
 class Metadata:
     status: str = "no archiver"
-    _processed_at: datetime = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
+    _processed_at: datetime = field(default_factory=datetime.datetime.utcnow)
     metadata: Dict[str, Any] = field(default_factory=dict)
-    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude":True})  # keys that are not to be saved in DBs
+    tmp_keys: Set[str] = field(default_factory=set, repr=False, metadata={"exclude": True})  # keys that are not to be saved in DBs
     media: List[Media] = field(default_factory=list)
     final_media: Media = None  # can be overwritten by formatters
     rearchivable: bool = False

     def merge(self: Metadata, right: Metadata, overwrite_left=True) -> Metadata:
         """
         merges two Metadata instances, will overwrite according to overwrite_left flag
```
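The `_processed_at` change fixes a classic dataclass pitfall: a plain default is evaluated once, at class-definition time, so every `Metadata` instance would share the timestamp of whenever the module was imported. `field(default_factory=...)` defers evaluation to construction. A minimal sketch of the difference (class names are hypothetical):

```python
from dataclasses import dataclass, field
import datetime
import time

@dataclass
class Frozen:
    # evaluated once when the class is defined: shared by all instances
    ts: datetime.datetime = datetime.datetime.utcnow()

@dataclass
class Fresh:
    # default_factory is called per instance, at construction time
    ts: datetime.datetime = field(default_factory=datetime.datetime.utcnow)

a, _, b = Frozen(), time.sleep(0.01), Frozen()
assert a.ts == b.ts   # same frozen timestamp
c, _, d = Fresh(), time.sleep(0.01), Fresh()
assert c.ts != d.ts   # fresh per instance
```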
```diff
@@ -33,6 +33,10 @@ class GsheetsFeeder(Gsheets, Feeder):
                 "default": set(),
                 "help": "(CSV) explicitly block some worksheets from being processed",
                 "cli_set": lambda cli_val, cur_val: set(cli_val.split(","))
             },
+            "use_sheet_names_in_stored_paths":{
+                "default": True,
+                "help": "if True the stored files path will include 'workbook_name/worksheet_name/...'",
+            }
         })
@@ -60,7 +64,10 @@ class GsheetsFeeder(Gsheets, Feeder):
                 if status not in ['', None]: continue

                 # All checks done - archival process starts here
-                yield Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True).set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                m = Metadata().set_url(url).set("gsheet", {"row": row, "worksheet": gw}, True)
+                if self.use_sheet_names_in_stored_paths:
+                    m.set("folder", os.path.join(slugify(self.sheet), slugify(wks.title)), True)
+                yield m

         logger.success(f'Finished worksheet {wks.title}')
```
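With `use_sheet_names_in_stored_paths` enabled (the default), the feeder sets each item's `folder` to the slugified workbook and worksheet names, so archived files land under `workbook_name/worksheet_name/...`. A quick illustration (the sheet names are made up):

```python
import os
from slugify import slugify  # same dependency the feeder uses

sheet_title, worksheet_title = "Incident Tracker Q3", "Week 12"  # hypothetical names
print(os.path.join(slugify(sheet_title), slugify(worksheet_title)))
# -> incident-tracker-q3/week-12
```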
```diff
@@ -5,18 +5,37 @@
 No URL available for {{ m.key }}.
 {% elif 'http' in url %}
 {% if 'image' in m.mimetype %}
-<a href="{{ url }}">
-    <img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
-</a>
+<div>
+    <a href="{{ url }}">
+        <img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
+    </a>
+
+    <div>
+        Reverse Image Search:
+        <a href="https://www.google.com/searchbyimage?sbisrc=4chanx&image_url={{ url | quote }}&safe=off">Google</a>,
+        <a href="https://lens.google.com/uploadbyurl?url={{ url | quote }}">Google Lens</a>,
+        <a href="https://yandex.ru/images/touch/search?rpt=imageview&url={{ url | quote }}">Yandex</a>,
+        <a href="https://www.bing.com/images/search?view=detailv2&iss=sbi&form=SBIVSP&sbisrc=UrlPaste&q=imgurl:{{ url | quote }}">Bing</a>,
+        <a href="https://www.tineye.com/search/?url={{ url | quote }}">Tineye</a>,
+        <a href="https://iqdb.org/?url={{ url | quote }}">IQDB</a>,
+        <a href="https://saucenao.com/search.php?db=999&url={{ url | quote }}">SauceNAO</a>,
+        <a href="https://imgops.com/{{ url | quote }}">IMGOPS</a>
+    </div>
+    <p></p>
+</div>
 {% elif 'video' in m.mimetype %}
-<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
-    Your browser does not support the video element.
-</video>
+<div>
+    <video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
+        Your browser does not support the video element.
+    </video>
+</div>
 {% elif 'audio' in m.mimetype %}
-<audio controls>
-    <source src="{{ url }}" type="{{ m.mimetype }}">
-    Your browser does not support the audio element.
-</audio>
+<div>
+    <audio controls>
+        <source src="{{ url }}" type="{{ m.mimetype }}">
+        Your browser does not support the audio element.
+    </audio>
+</div>
 {% elif m.filename | get_extension == ".wacz" %}
 <a href="https://replayweb.page/?source={{ url | quote }}#view=pages&url={{ main_url }}">replayweb</a>
 {% else %}
@@ -26,9 +45,9 @@ No preview available for {{ m.key }}.
 {{ m.url | urlize }}
 {% endif %}
 {% if links %}
 <br>
 <a href="{{ url }}">open</a> or
 <a href="{{ url }}" download="">download</a>
 <br>
 {% endif %}
 {% endfor %}
```
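The reverse-image-search links all follow one pattern: percent-encode the media URL (the template's `quote` filter) and splice it into each engine's query string. A small Python sketch of the same construction, assuming `urllib.parse.quote` behaves like the filter:

```python
from urllib.parse import quote

def reverse_search_links(url: str) -> dict[str, str]:
    """Build reverse-image-search URLs for a media URL (engines as in the template)."""
    q = quote(url, safe="")  # percent-encode the full URL, including '/' and ':'
    return {
        "Google Lens": f"https://lens.google.com/uploadbyurl?url={q}",
        "Yandex": f"https://yandex.ru/images/touch/search?rpt=imageview&url={q}",
        "Tineye": f"https://www.tineye.com/search/?url={q}",
    }

print(reverse_search_links("https://example.com/img.jpg")["Tineye"])
```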
```diff
@@ -20,13 +20,15 @@ class LocalStorage(Storage):

     @staticmethod
     def configs() -> dict:
-        return {
-            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
-            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
-            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
-        }
+        return dict(
+            Storage.configs(),
+            ** {
+                "save_to": {"default": "./archived", "help": "folder where to save archived content"},
+                "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative in the output result inc. formatters (WARN: leaks the file structure)"},
+            })

     def get_cdn_url(self, media: Media) -> str:
+        #TODO: is this viable with Storage.configs on path/filename?
         dest = os.path.join(self.save_to, media.key)
         if self.save_absolute:
             dest = os.path.abspath(dest)
@@ -34,14 +36,12 @@ class LocalStorage(Storage):

     def upload(self, media: Media, **kwargs) -> bool:
         # override parent so that we can use shutil.copy2 and keep metadata
-        if self.flatten:
-            dest = os.path.join(self.save_to, slugify(media.key))
-        else:
-            dest = os.path.join(self.save_to, media.key)
-        os.makedirs(dest, exist_ok=True)
+        dest = os.path.join(self.save_to, media.key)
+        os.makedirs(os.path.dirname(dest), exist_ok=True)
         logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
-        shutil.copy2(media.filename, dest)
+        res = shutil.copy2(media.filename, dest)
+        logger.info(res)
         return True

     # must be implemented even if unused
     def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
```
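Both storages now layer their own options over the base class via `dict(Storage.configs(), **{...})`. A minimal sketch of that merge's semantics (the keys are illustrative):

```python
base = {"path_generator": {"default": "url"}, "filename_generator": {"default": "random"}}
extra = {"save_to": {"default": "./archived"}}

# dict(base, **extra) copies base, then applies extra on top;
# duplicate keys in extra would win, and base itself is left untouched
merged = dict(base, **extra)
assert set(merged) == {"path_generator", "filename_generator", "save_to"}
```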
```diff
@@ -24,23 +24,25 @@ class S3Storage(Storage):

     @staticmethod
     def configs() -> dict:
-        return {
-            "bucket": {"default": None, "help": "S3 bucket name"},
-            "region": {"default": None, "help": "S3 region name"},
-            "key": {"default": None, "help": "S3 API key"},
-            "secret": {"default": None, "help": "S3 API secret"},
-            # TODO: how to have sth like a custom folder? has to come from the feeders
-            "endpoint_url": {
-                "default": 'https://{region}.digitaloceanspaces.com',
-                "help": "S3 bucket endpoint, {region} are inserted at runtime"
-            },
-            "cdn_url": {
-                "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
-                "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
-            },
-            "private": {"default": False, "help": "if true S3 files will not be readable online"},
-            # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
-        }
+        return dict(
+            Storage.configs(),
+            ** {
+                "bucket": {"default": None, "help": "S3 bucket name"},
+                "region": {"default": None, "help": "S3 region name"},
+                "key": {"default": None, "help": "S3 API key"},
+                "secret": {"default": None, "help": "S3 API secret"},
+                # TODO: how to have sth like a custom folder? has to come from the feeders
+                "endpoint_url": {
+                    "default": 'https://{region}.digitaloceanspaces.com',
+                    "help": "S3 bucket endpoint, {region} are inserted at runtime"
+                },
+                "cdn_url": {
+                    "default": 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}',
+                    "help": "S3 CDN url, {bucket}, {region} and {key} are inserted at runtime"
+                },
+                "private": {"default": False, "help": "if true S3 files will not be readable online"},
+                # "key_path": {"default": "random", "help": "S3 file names are non-predictable strings, one of ['random', 'default']"},
+            })

     def get_cdn_url(self, media: Media) -> str:
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
```
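The `endpoint_url` and `cdn_url` options are plain `str.format` templates; `get_cdn_url` fills in bucket, region, and media key at runtime. For illustration, with made-up values:

```python
cdn_url = 'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{key}'
# hypothetical bucket/region/key, for illustration only
print(cdn_url.format(bucket="my-archive", region="fra1", key="some-url/abc123.jpg"))
# -> https://my-archive.fra1.cdn.digitaloceanspaces.com/some-url/abc123.jpg
```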
```diff
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from abc import abstractmethod
 from dataclasses import dataclass
+import hashlib
 from typing import IO, Any

 from ..core import Media, Metadata, Step
@@ -12,13 +13,32 @@ from slugify import slugify
 @dataclass
 class Storage(Step):
     name = "storage"
+    PATH_GENERATOR_OPTIONS = ["flat", "url", "random"]
+    FILENAME_GENERATOR_CHOICES = ["random", "static"]
+
+    def __init__(self, config: dict) -> None:
+        # without this STEP.__init__ is not called
+        super().__init__(config)
+        assert self.path_generator in Storage.PATH_GENERATOR_OPTIONS, f"path_generator must be one of {Storage.PATH_GENERATOR_OPTIONS}"
+        assert self.filename_generator in Storage.FILENAME_GENERATOR_CHOICES, f"filename_generator must be one of {Storage.FILENAME_GENERATOR_CHOICES}"
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "path_generator": {
+                "default": "url",
+                "help": "how to store the file in terms of directory structure: 'flat' sets to root; 'url' creates a directory based on the provided URL; 'random' creates a random directory.",
+                "choices": Storage.PATH_GENERATOR_OPTIONS
+            },
+            "filename_generator": {
+                "default": "random",
+                "help": "how to name stored files: 'random' creates a random string; 'static' uses a replicable strategy such as a hash.",
+                "choices": Storage.FILENAME_GENERATOR_CHOICES
+            }
+        }

-    # only for typing...
     def init(name: str, config: dict) -> Storage:
+        # only for typing...
         return Step.init(name, config, Storage)

     def store(self, media: Media, item: Metadata) -> None:
```
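The new `__init__` enforces the `choices` declared in `configs()` with assertions, so a misconfigured storage fails at construction rather than at upload time. A stripped-down sketch of the pattern (in the real code the attribute wiring happens in `Step.__init__`, elided here):

```python
class StorageSketch:
    PATH_GENERATOR_OPTIONS = ["flat", "url", "random"]

    def __init__(self, config: dict) -> None:
        # the real Step.__init__ turns config entries into attributes; simplified here
        self.path_generator = config.get("path_generator", "url")
        assert self.path_generator in self.PATH_GENERATOR_OPTIONS, \
            f"path_generator must be one of {self.PATH_GENERATOR_OPTIONS}"

StorageSketch({"path_generator": "flat"})    # ok
# StorageSketch({"path_generator": "zip"})   # AssertionError
```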
```diff
@@ -38,10 +58,24 @@ class Storage(Step):
         return self.uploadf(f, media, **kwargs)

     def set_key(self, media: Media, item: Metadata) -> None:
-        #TODO: accept options to make these predictable or random
         """takes the media and optionally item info and generates a key"""
         if media.key is not None and len(media.key) > 0: return
         folder = item.get("folder", "")
-        ext = os.path.splitext(media.filename)[1]
-        # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
-        media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
+        filename, ext = os.path.splitext(media.filename)
+
+        # path_generator logic
+        if self.path_generator == "flat":
+            path = ""
+            filename = slugify(filename)  # in case it comes with os.sep
+        elif self.path_generator == "url": path = slugify(item.get_url())
+        elif self.path_generator == "random":
+            path = item.get("random_path", str(uuid.uuid4())[:16], True)
+
+        # filename_generator logic
+        if self.filename_generator == "random": filename = str(uuid.uuid4())[:16]
+        elif self.filename_generator == "static":
+            with open(media.filename, "rb") as f:
+                bytes = f.read()  # read entire file as bytes
+                filename = hashlib.sha256(bytes).hexdigest()[:24]
+
+        media.key = os.path.join(folder, path, f"{filename}{ext}")
```
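Putting the two knobs together, `set_key` produces `folder/<path>/<filename><ext>`: the directory comes from `path_generator` and the basename from `filename_generator` ('static' hashes the file contents, so re-archiving the same bytes yields the same name). A self-contained sketch of the same logic outside the class (sample values hypothetical):

```python
import hashlib
import os
import uuid

from slugify import slugify  # same dependency the project uses

def make_key(folder: str, url: str, local_file: str,
             path_generator: str = "url", filename_generator: str = "random") -> str:
    filename, ext = os.path.splitext(os.path.basename(local_file))

    if path_generator == "flat":
        path = ""                      # everything at the root of the storage
    elif path_generator == "url":
        path = slugify(url)            # one directory per archived URL
    else:                              # "random"
        path = str(uuid.uuid4())[:16]

    if filename_generator == "random":
        filename = str(uuid.uuid4())[:16]
    else:                              # "static": replicable hash of the contents
        with open(local_file, "rb") as f:
            filename = hashlib.sha256(f.read()).hexdigest()[:24]

    return os.path.join(folder, path, f"{filename}{ext}")
```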