local storage + multiple storage support

pull/72/head
msramalho 2023-01-12 02:09:39 +00:00
rodzic 0cb593fd21
commit 6ca46417fe
11 zmienionych plików z 117 dodań i 54 usunięć

Wyświetl plik

@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2):
if mp.entities:
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
if len(other_media_urls):
logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
for i, om_url in enumerate(other_media_urls):
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
self.download_from_url(om_url, filename)
result.add_media(Media(filename))
result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
filename = self.client.download_media(mp.media, filename_dest)

Wyświetl plik

@ -63,13 +63,13 @@ class GsheetsDb(Database):
media: Media = item.get_single_media()
batch_if_valid('archive', media.cdn_url)
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")):
batch_if_valid('screenshot', screenshot.cdn_url)
batch_if_valid('screenshot', "\n".join(screenshot.urls))
# batch_if_valid('status', item.status)
# TODO: AFTER ENRICHMENTS

Wyświetl plik

@ -21,7 +21,7 @@ class WaybackEnricher(Enricher):
@staticmethod
def configs() -> dict:
return {
"timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
}

Wyświetl plik

@ -26,6 +26,7 @@
table,
th,
td {
margin: auto;
border: 1px solid;
border-collapse: collapse;
}
@ -43,18 +44,17 @@
<body>
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
<p>title: '<span>{{ title }}</span>'</p>
<p><b>title:</b> '<span>{{ title }}</span>'</p>
<h2 class="center">content {{ media | length }} item(s)</h2>
<table class="content">
<tr>
<th>about</th>
<th>preview</th>
<th>preview(s)</th>
</tr>
{% for m in media %}
<tr>
<td>
<ul>
<li><a href="{{ m.cdn_url }}">ARCHIVE</a></li>
{% if m.hash | length > 1 %}
<li>hash: <span>{{ m.hash }}</span></li>
{% endif %}
@ -67,25 +67,28 @@
</td>
<td>
{% for url in m.urls %}
{% if 'image' in m.mimetype %}
<img src="{{ m.cdn_url }}" style="max-height:400px;max-width:400px;"></img>
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
{% elif 'video' in m.mimetype %}
<video src="{{ m.cdn_url }}" controls style="max-height:400px;max-width:600px;">
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
Your browser does not support the video element.
</video>
{% elif 'audio' in m.mimetype %}
<audio controls>
<source src="{{ m.cdn_url }}" type="{{ m.mimetype }}">
<source src="{{ url }}" type="{{ m.mimetype }}">
Your browser does not support the audio element.
</audio>
{% else %}
No preview available, please open the link.
{% endif %}
<li><a href="{{ url }}">{{ url}}</a></li>
{% endfor %}
</td>
</tr>
{% endfor %}
</table>
<h2>metadata</h2>
<h2 class="center">metadata</h2>
<table class="metadata">
<tr>
<th>key</th>
@ -100,7 +103,7 @@
</table>
<hr>
<p>made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a>, add suggestions and report issues on the project's github page</p>
<p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
</body>
</html>

Wyświetl plik

@ -2,7 +2,7 @@
from __future__ import annotations
from ast import List
from typing import Any, Union, Dict
from dataclasses import dataclass
from dataclasses import dataclass, field
import mimetypes
@ -10,15 +10,25 @@ import mimetypes
class Media:
filename: str
key: str = None
cdn_url: str = None
mimetype: str = None # eg: image/jpeg
id: str = None # in case this type of media needs a special id, eg: screenshot
urls: List[str] = field(default_factory=list)
_mimetype: str = None # eg: image/jpeg
id: str = "" # in case this type of media needs a special id, eg: screenshot
# hash: str = None # TODO: added by enrichers
def set_mimetype(self) -> Media:
if not self.mimetype:
self.mimetype = mimetypes.guess_type(self.filename)[0]
return self
def add_url(self, url: str) -> None:
# url can be remote, local, ...
self.urls.append(url)
@property # getter .mimetype
def mimetype(self) -> str:
assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename"
if not self._mimetype:
self._mimetype = mimetypes.guess_type(self.filename)[0]
return self._mimetype
@mimetype.setter # setter .mimetype
def mimetype(self, v: str) -> None:
self._mimetype = v
def is_video(self) -> bool:
return self.mimetype.startswith("video")
return self._mimetype.startswith("video")

Wyświetl plik

@ -98,7 +98,6 @@ class Metadata:
def add_media(self, media: Media) -> Metadata:
if media is None: return
media.set_mimetype()
return self.media.append(media)
def get_media_by_id(self, id:str) -> Media:
@ -110,7 +109,6 @@ class Metadata:
if final:
if self.final_media:
logger.warning(f"overwriting final media value :{self.final_media} with {final}")
final.set_mimetype()
self.final_media = final
return self

Wyświetl plik

@ -52,6 +52,7 @@ Cisticola considerations:
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
"""
class ArchivingOrchestrator:
def __init__(self, config) -> None:
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
@ -65,8 +66,8 @@ class ArchivingOrchestrator:
# Archiver.init(a, config)
# for a in config.archivers
# ]
self.feeder : Feeder = config.feeder
self.formatter : Formatter = config.formatter
self.feeder: Feeder = config.feeder
self.formatter: Formatter = config.formatter
self.enrichers = config.enrichers
self.archivers: List[Archiverv2] = config.archivers
self.databases: List[Database] = config.databases
@ -173,11 +174,9 @@ class ArchivingOrchestrator:
e.enrich(result)
# store media
unstored_media = result.media[::]
result.media = []
for s in self.storages:
for m in unstored_media:
result.media.append(s.store(m, result))
for m in result.media:
s.store(m, result) # modifies media
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
# TODO: should there only be 1 formatter?
@ -186,7 +185,8 @@ class ArchivingOrchestrator:
# final format and store it
if (final_media := self.formatter.format(result)):
for s in self.storages:
result.set_final_media(s.store(final_media, result))
s.store(final_media, result)
result.set_final_media(final_media)
# signal completion to databases (DBs, Google Sheets, CSV, ...)
# a hash registration service could be one database: forensic archiving

Wyświetl plik

@ -5,4 +5,5 @@ from .s3_storage import S3Config, S3Storage
from .gd_storage import GDConfig, GDStorage
from .storage import StorageV2
from .s3 import S3StorageV2
from .s3 import S3StorageV2
from .local import LocalStorageV2

Wyświetl plik

@ -0,0 +1,46 @@
import shutil
from typing import IO, Any
import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
from metadata import Metadata
from media import Media
from storages import StorageV2
from loguru import logger
from slugify import slugify
class LocalStorageV2(StorageV2):
    """Storage implementation that archives media to the local filesystem.

    Files are copied (with metadata, via shutil.copy2) from their temporary
    location into the `save_to` folder. With `flatten` enabled all files are
    written directly under `save_to` using a slugified version of their key;
    otherwise the key's subdirectory structure is preserved.
    """
    name = "local_storage"

    def __init__(self, config: dict) -> None:
        super().__init__(config)
        # make sure the destination root exists up-front
        os.makedirs(self.save_to, exist_ok=True)

    @staticmethod
    def configs() -> dict:
        return {
            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
        }

    def get_cdn_url(self, media: Media) -> str:
        """Return the path where this media is (or will be) stored.

        Uses the same destination logic as upload() so the reported path
        matches the actual file location, including when `flatten` is on.
        """
        dest = self._destination_path(media)
        if self.save_absolute:
            dest = os.path.abspath(dest)
        return dest

    def _destination_path(self, media: Media) -> str:
        # single source of truth for where a media's key maps on disk,
        # shared by get_cdn_url() and upload() so they never disagree
        if self.flatten:
            # collapse any subdirectories in the key into one safe filename
            return os.path.join(self.save_to, slugify(media.key))
        return os.path.join(self.save_to, media.key)

    def upload(self, media: Media, **kwargs) -> bool:
        # override parent so that we can use shutil.copy2 and keep metadata
        dest = self._destination_path(media)
        # create the PARENT directory, not `dest` itself: making `dest` a
        # directory would cause copy2 to drop the file inside it under the
        # source basename, breaking the path advertised by get_cdn_url()
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
        shutil.copy2(media.filename, dest)
        return True

    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass

Wyświetl plik

@ -45,26 +45,27 @@ class S3StorageV2(StorageV2):
def get_cdn_url(self, media: Media) -> str:
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any:
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read'
if 'ContentType' not in extra_args:
try:
extra_args['ContentType'] = mimetypes.guess_type(media.key)[0]
extra_args['ContentType'] = media.mimetype
except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def exists(self, key: str) -> bool:
"""
Tests if a given file with key=key exists in the bucket
"""
try:
self.s3.head_object(Bucket=self.bucket, Key=key)
return True
except ClientError as e:
logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
return False
# def exists(self, key: str) -> bool:
# """
# Tests if a given file with key=key exists in the bucket
# """
# try:
# self.s3.head_object(Bucket=self.bucket, Key=key)
# return True
# except ClientError as e:
# logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
# return False

Wyświetl plik

@ -7,6 +7,7 @@ from metadata import Metadata
from steps.step import Step
from loguru import logger
import os, uuid
from slugify import slugify
@dataclass
@ -21,23 +22,26 @@ class StorageV2(Step):
def init(name: str, config: dict) -> StorageV2:
return Step.init(name, config, StorageV2)
def store(self, media: Media, item: Metadata) -> Media:
media = self.set_key(media, item)
def store(self, media: Media, item: Metadata) -> None:
self.set_key(media, item)
self.upload(media)
media.cdn_url = self.get_cdn_url(media)
return media
media.add_url(self.get_cdn_url(media))
@abstractmethod
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass
def get_cdn_url(self, media: Media) -> str: pass
def upload(self, media: Media, **kwargs) -> Any:
logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}')
@abstractmethod
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, item: Metadata) -> Media:
def set_key(self, media: Media, item: Metadata) -> None:
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = item.get("folder", "")
ext = os.path.splitext(media.filename)[1]
media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
return media
# media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")