local storage + multiple storage support

pull/72/head
msramalho 2023-01-12 02:09:39 +00:00
rodzic 0cb593fd21
commit 6ca46417fe
11 zmienionych plików z 117 dodań i 54 usunięć

Wyświetl plik

@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2):
if mp.entities: if mp.entities:
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]] other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
if len(other_media_urls): if len(other_media_urls):
logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}") logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
for i, om_url in enumerate(other_media_urls): for i, om_url in enumerate(other_media_urls):
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}') filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
self.download_from_url(om_url, filename) self.download_from_url(om_url, filename)
result.add_media(Media(filename)) result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id)) filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
filename = self.client.download_media(mp.media, filename_dest) filename = self.client.download_media(mp.media, filename_dest)

Wyświetl plik

@ -63,13 +63,13 @@ class GsheetsDb(Database):
media: Media = item.get_single_media() media: Media = item.get_single_media()
batch_if_valid('archive', media.cdn_url) batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()) batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title()) batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500]) batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('timestamp', item.get_timestamp()) batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")): if (screenshot := item.get_media_by_id("screenshot")):
batch_if_valid('screenshot', screenshot.cdn_url) batch_if_valid('screenshot', "\n".join(screenshot.urls))
# batch_if_valid('status', item.status) # batch_if_valid('status', item.status)
# TODO: AFTER ENRICHMENTS # TODO: AFTER ENRICHMENTS

Wyświetl plik

@ -21,7 +21,7 @@ class WaybackEnricher(Enricher):
@staticmethod @staticmethod
def configs() -> dict: def configs() -> dict:
return { return {
"timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."}, "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"}, "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"} "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
} }

Wyświetl plik

@ -26,6 +26,7 @@
table, table,
th, th,
td { td {
margin: auto;
border: 1px solid; border: 1px solid;
border-collapse: collapse; border-collapse: collapse;
} }
@ -43,18 +44,17 @@
<body> <body>
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2> <h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
<p>title: '<span>{{ title }}</span>'</p> <p><b>title:</b> '<span>{{ title }}</span>'</p>
<h2 class="center">content {{ media | length }} item(s)</h2> <h2 class="center">content {{ media | length }} item(s)</h2>
<table class="content"> <table class="content">
<tr> <tr>
<th>about</th> <th>about</th>
<th>preview</th> <th>preview(s)</th>
</tr> </tr>
{% for m in media %} {% for m in media %}
<tr> <tr>
<td> <td>
<ul> <ul>
<li><a href="{{ m.cdn_url }}">ARCHIVE</a></li>
{% if m.hash | length > 1 %} {% if m.hash | length > 1 %}
<li>hash: <span>{{ m.hash }}</span></li> <li>hash: <span>{{ m.hash }}</span></li>
{% endif %} {% endif %}
@ -67,25 +67,28 @@
</td> </td>
<td> <td>
{% for url in m.urls %}
{% if 'image' in m.mimetype %} {% if 'image' in m.mimetype %}
<img src="{{ m.cdn_url }}" style="max-height:400px;max-width:400px;"></img> <img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
{% elif 'video' in m.mimetype %} {% elif 'video' in m.mimetype %}
<video src="{{ m.cdn_url }}" controls style="max-height:400px;max-width:600px;"> <video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
Your browser does not support the video element. Your browser does not support the video element.
</video> </video>
{% elif 'audio' in m.mimetype %} {% elif 'audio' in m.mimetype %}
<audio controls> <audio controls>
<source src="{{ m.cdn_url }}" type="{{ m.mimetype }}"> <source src="{{ url }}" type="{{ m.mimetype }}">
Your browser does not support the audio element. Your browser does not support the audio element.
</audio> </audio>
{% else %} {% else %}
No preview available, please open the link. No preview available, please open the link.
{% endif %} {% endif %}
<li><a href="{{ url }}">{{ url}}</a></li>
{% endfor %}
</td> </td>
</tr> </tr>
{% endfor %} {% endfor %}
</table> </table>
<h2>metadata</h2> <h2 class="center">metadata</h2>
<table class="metadata"> <table class="metadata">
<tr> <tr>
<th>key</th> <th>key</th>
@ -100,7 +103,7 @@
</table> </table>
<hr> <hr>
<p>made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a>, add suggestions and report issues on the project's github page</p> <p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
</body> </body>
</html> </html>

Wyświetl plik

@ -2,7 +2,7 @@
from __future__ import annotations from __future__ import annotations
from ast import List from ast import List
from typing import Any, Union, Dict from typing import Any, Union, Dict
from dataclasses import dataclass from dataclasses import dataclass, field
import mimetypes import mimetypes
@ -10,15 +10,25 @@ import mimetypes
class Media: class Media:
filename: str filename: str
key: str = None key: str = None
cdn_url: str = None urls: List[str] = field(default_factory=list)
mimetype: str = None # eg: image/jpeg _mimetype: str = None # eg: image/jpeg
id: str = None # in case this type of media needs a special id, eg: screenshot id: str = "" # in case this type of media needs a special id, eg: screenshot
# hash: str = None # TODO: added by enrichers # hash: str = None # TODO: added by enrichers
def set_mimetype(self) -> Media: def add_url(self, url: str) -> None:
if not self.mimetype: # url can be remote, local, ...
self.mimetype = mimetypes.guess_type(self.filename)[0] self.urls.append(url)
return self
@property # getter .mimetype
def mimetype(self) -> str:
assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename"
if not self._mimetype:
self._mimetype = mimetypes.guess_type(self.filename)[0]
return self._mimetype
@mimetype.setter # setter .mimetype
def mimetype(self, v: str) -> None:
self._mimetype = v
def is_video(self) -> bool: def is_video(self) -> bool:
return self.mimetype.startswith("video") return self._mimetype.startswith("video")

Wyświetl plik

@ -98,7 +98,6 @@ class Metadata:
def add_media(self, media: Media) -> Metadata: def add_media(self, media: Media) -> Metadata:
if media is None: return if media is None: return
media.set_mimetype()
return self.media.append(media) return self.media.append(media)
def get_media_by_id(self, id:str) -> Media: def get_media_by_id(self, id:str) -> Media:
@ -110,7 +109,6 @@ class Metadata:
if final: if final:
if self.final_media: if self.final_media:
logger.warning(f"overwriting final media value :{self.final_media} with {final}") logger.warning(f"overwriting final media value :{self.final_media} with {final}")
final.set_mimetype()
self.final_media = final self.final_media = final
return self return self

Wyświetl plik

@ -52,6 +52,7 @@ Cisticola considerations:
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
""" """
class ArchivingOrchestrator: class ArchivingOrchestrator:
def __init__(self, config) -> None: def __init__(self, config) -> None:
# in config.py we should test that the archivers exist and log mismatches (blocking execution) # in config.py we should test that the archivers exist and log mismatches (blocking execution)
@ -65,8 +66,8 @@ class ArchivingOrchestrator:
# Archiver.init(a, config) # Archiver.init(a, config)
# for a in config.archivers # for a in config.archivers
# ] # ]
self.feeder : Feeder = config.feeder self.feeder: Feeder = config.feeder
self.formatter : Formatter = config.formatter self.formatter: Formatter = config.formatter
self.enrichers = config.enrichers self.enrichers = config.enrichers
self.archivers: List[Archiverv2] = config.archivers self.archivers: List[Archiverv2] = config.archivers
self.databases: List[Database] = config.databases self.databases: List[Database] = config.databases
@ -173,11 +174,9 @@ class ArchivingOrchestrator:
e.enrich(result) e.enrich(result)
# store media # store media
unstored_media = result.media[::]
result.media = []
for s in self.storages: for s in self.storages:
for m in unstored_media: for m in result.media:
result.media.append(s.store(m, result)) s.store(m, result) # modifies media
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li> # formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
# TODO: should there only be 1 formatter? # TODO: should there only be 1 formatter?
@ -186,7 +185,8 @@ class ArchivingOrchestrator:
# final format and store it # final format and store it
if (final_media := self.formatter.format(result)): if (final_media := self.formatter.format(result)):
for s in self.storages: for s in self.storages:
result.set_final_media(s.store(final_media, result)) s.store(final_media, result)
result.set_final_media(final_media)
# signal completion to databases (DBs, Google Sheets, CSV, ...) # signal completion to databases (DBs, Google Sheets, CSV, ...)
# a hash registration service could be one database: forensic archiving # a hash registration service could be one database: forensic archiving

Wyświetl plik

@ -5,4 +5,5 @@ from .s3_storage import S3Config, S3Storage
from .gd_storage import GDConfig, GDStorage from .gd_storage import GDConfig, GDStorage
from .storage import StorageV2 from .storage import StorageV2
from .s3 import S3StorageV2 from .s3 import S3StorageV2
from .local import LocalStorageV2

Wyświetl plik

@ -0,0 +1,46 @@
import shutil
from typing import IO, Any
import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
from metadata import Metadata
from media import Media
from storages import StorageV2
from loguru import logger
from slugify import slugify
class LocalStorageV2(StorageV2):
    """Storage step that copies archived media into a folder on the local disk.

    The methods read ``self.save_to``, ``self.flatten`` and
    ``self.save_absolute``. NOTE(review): these are presumably populated from
    ``configs()`` by the ``StorageV2``/``Step`` constructor, which is not
    visible here -- confirm.
    """
    name = "local_storage"

    def __init__(self, config: dict) -> None:
        """Initialise the step and make sure the root output folder exists."""
        super().__init__(config)
        os.makedirs(self.save_to, exist_ok=True)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options (defaults and help) of this storage."""
        return {
            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
        }

    def _dest_path(self, media: Media) -> str:
        """Return the path under ``save_to`` where ``media`` is stored.

        Single source of truth for both ``upload`` and ``get_cdn_url`` so the
        reported location always matches the location actually written.
        """
        key = media.key
        if self.flatten:
            # Slugify only the stem: slugify would also replace the dot and the
            # stored file would lose its extension.
            stem, ext = os.path.splitext(key)
            key = f"{slugify(stem)}{ext}"
        return os.path.join(self.save_to, key)

    def get_cdn_url(self, media: Media) -> str:
        """Return the local path of the stored ``media``.

        The path is made absolute when ``save_absolute`` is set, otherwise it
        is returned as built from ``save_to``.
        """
        dest = self._dest_path(media)
        if self.save_absolute:
            dest = os.path.abspath(dest)
        return dest

    def upload(self, media: Media, **kwargs) -> bool:
        """Copy ``media.filename`` to its destination and return ``True``.

        Overrides the parent implementation so that ``shutil.copy2`` can be
        used, which also preserves file metadata.
        """
        dest = self._dest_path(media)
        # Create the *parent* folder, never ``dest`` itself: ``dest`` is the
        # file to be written, not a directory.
        parent = os.path.dirname(dest)
        if parent:
            os.makedirs(parent, exist_ok=True)
        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
        shutil.copy2(media.filename, dest)
        return True

    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
        """No-op stub; ``upload`` is overridden and copies the file directly.

        Does nothing and returns ``None`` despite the ``bool`` annotation.
        """
        pass

Wyświetl plik

@ -45,26 +45,27 @@ class S3StorageV2(StorageV2):
def get_cdn_url(self, media: Media) -> str: def get_cdn_url(self, media: Media) -> str:
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key) return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any: def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
extra_args = kwargs.get("extra_args", {}) extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args: if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read' extra_args['ACL'] = 'public-read'
if 'ContentType' not in extra_args: if 'ContentType' not in extra_args:
try: try:
extra_args['ContentType'] = mimetypes.guess_type(media.key)[0] extra_args['ContentType'] = media.mimetype
except Exception as e: except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}") logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args) self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def exists(self, key: str) -> bool: # def exists(self, key: str) -> bool:
""" # """
Tests if a given file with key=key exists in the bucket # Tests if a given file with key=key exists in the bucket
""" # """
try: # try:
self.s3.head_object(Bucket=self.bucket, Key=key) # self.s3.head_object(Bucket=self.bucket, Key=key)
return True # return True
except ClientError as e: # except ClientError as e:
logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}") # logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
return False # return False

Wyświetl plik

@ -7,6 +7,7 @@ from metadata import Metadata
from steps.step import Step from steps.step import Step
from loguru import logger from loguru import logger
import os, uuid import os, uuid
from slugify import slugify
@dataclass @dataclass
@ -21,23 +22,26 @@ class StorageV2(Step):
def init(name: str, config: dict) -> StorageV2: def init(name: str, config: dict) -> StorageV2:
return Step.init(name, config, StorageV2) return Step.init(name, config, StorageV2)
def store(self, media: Media, item: Metadata) -> Media: def store(self, media: Media, item: Metadata) -> None:
media = self.set_key(media, item) self.set_key(media, item)
self.upload(media) self.upload(media)
media.cdn_url = self.get_cdn_url(media) media.add_url(self.get_cdn_url(media))
return media
@abstractmethod @abstractmethod
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass def get_cdn_url(self, media: Media) -> str: pass
def upload(self, media: Media, **kwargs) -> Any: @abstractmethod
logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}') def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
with open(media.filename, 'rb') as f: with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs) return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, item: Metadata) -> Media: def set_key(self, media: Media, item: Metadata) -> None:
"""takes the media and optionally item info and generates a key""" """takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = item.get("folder", "") folder = item.get("folder", "")
ext = os.path.splitext(media.filename)[1] ext = os.path.splitext(media.filename)[1]
media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}") # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
return media media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")