Mirror of https://github.com/bellingcat/auto-archiver

local storage + multiple storage support

parent 0cb593fd21
commit 6ca46417fe
@@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2):
             if mp.entities:
                 other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
                 if len(other_media_urls):
-                    logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
+                    logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
                     for i, om_url in enumerate(other_media_urls):
                         filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
                         self.download_from_url(om_url, filename)
-                        result.add_media(Media(filename))
+                        result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))

             filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
             filename = self.client.download_media(mp.media, filename_dest)
@@ -63,13 +63,13 @@ class GsheetsDb(Database):

         media: Media = item.get_single_media()

-        batch_if_valid('archive', media.cdn_url)
+        batch_if_valid('archive', "\n".join(media.urls))
         batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
         batch_if_valid('title', item.get_title())
         batch_if_valid('text', item.get("content", "")[:500])
         batch_if_valid('timestamp', item.get_timestamp())
         if (screenshot := item.get_media_by_id("screenshot")):
-            batch_if_valid('screenshot', screenshot.cdn_url)
+            batch_if_valid('screenshot', "\n".join(screenshot.urls))
         # batch_if_valid('status', item.status)

         # TODO: AFTER ENRICHMENTS
@@ -21,7 +21,7 @@ class WaybackEnricher(Enricher):
     @staticmethod
     def configs() -> dict:
         return {
-            "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
+            "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
             "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
             "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
         }
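For orientation, a minimal standalone sketch of how a step's configs() defaults (like the WaybackEnricher block above) could be merged with user-supplied overrides; the override values and the merge helper are illustrative assumptions, not the project's actual config loader.

# illustrative only: merge configs() defaults with hypothetical user overrides
wayback_configs = {
    "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback"},
    "key": {"default": None, "help": "wayback API key"},
    "secret": {"default": None, "help": "wayback API secret"},
}
defaults = {name: opts["default"] for name, opts in wayback_configs.items()}
user_overrides = {"timeout": 30, "key": "ACCESS_KEY", "secret": "SECRET_KEY"}  # hypothetical values
effective = {**defaults, **user_overrides}
print(effective)  # {'timeout': 30, 'key': 'ACCESS_KEY', 'secret': 'SECRET_KEY'}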
@@ -26,6 +26,7 @@
         table,
         th,
         td {
             margin: auto;
             border: 1px solid;
             border-collapse: collapse;
         }

@@ -43,18 +44,17 @@

 <body>
     <h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
-    <p>title: '<span>{{ title }}</span>'</p>
+    <p><b>title:</b> '<span>{{ title }}</span>'</p>
     <h2 class="center">content {{ media | length }} item(s)</h2>
     <table class="content">
         <tr>
             <th>about</th>
-            <th>preview</th>
+            <th>preview(s)</th>
         </tr>
         {% for m in media %}
         <tr>
             <td>
                 <ul>
                     <li><a href="{{ m.cdn_url }}">ARCHIVE</a></li>
                     {% if m.hash | length > 1 %}
                     <li>hash: <span>{{ m.hash }}</span></li>
                     {% endif %}

@@ -67,25 +67,28 @@

             </td>
             <td>
+                {% for url in m.urls %}
                 {% if 'image' in m.mimetype %}
-                <img src="{{ m.cdn_url }}" style="max-height:400px;max-width:400px;"></img>
+                <img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
                 {% elif 'video' in m.mimetype %}
-                <video src="{{ m.cdn_url }}" controls style="max-height:400px;max-width:600px;">
+                <video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
                     Your browser does not support the video element.
                 </video>
                 {% elif 'audio' in m.mimetype %}
                 <audio controls>
-                    <source src="{{ m.cdn_url }}" type="{{ m.mimetype }}">
+                    <source src="{{ url }}" type="{{ m.mimetype }}">
                     Your browser does not support the audio element.
                 </audio>
                 {% else %}
                 No preview available, please open the link.
                 {% endif %}
+                <li><a href="{{ url }}">{{ url}}</a></li>
+                {% endfor %}
             </td>
         </tr>
         {% endfor %}
     </table>
-    <h2>metadata</h2>
+    <h2 class="center">metadata</h2>
     <table class="metadata">
         <tr>
             <th>key</th>

@@ -100,7 +103,7 @@
     </table>

     <hr>
-    <p>made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a>, add suggestions and report issues on the project's github page</p>
+    <p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
 </body>

 </html>
src/media.py

@@ -2,7 +2,7 @@
 from __future__ import annotations
 from ast import List
 from typing import Any, Union, Dict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import mimetypes

@@ -10,15 +10,25 @@ import mimetypes
 class Media:
     filename: str
     key: str = None
     cdn_url: str = None
-    mimetype: str = None  # eg: image/jpeg
-    id: str = None  # in case this type of media needs a special id, eg: screenshot
+    urls: List[str] = field(default_factory=list)
+    _mimetype: str = None  # eg: image/jpeg
+    id: str = ""  # in case this type of media needs a special id, eg: screenshot
     # hash: str = None # TODO: added by enrichers

-    def set_mimetype(self) -> Media:
-        if not self.mimetype:
-            self.mimetype = mimetypes.guess_type(self.filename)[0]
-        return self
+    def add_url(self, url: str) -> None:
+        # url can be remote, local, ...
+        self.urls.append(url)

+    @property  # getter .mimetype
+    def mimetype(self) -> str:
+        assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename"
+        if not self._mimetype:
+            self._mimetype = mimetypes.guess_type(self.filename)[0]
+        return self._mimetype

+    @mimetype.setter  # setter .mimetype
+    def mimetype(self, v: str) -> None:
+        self._mimetype = v

     def is_video(self) -> bool:
-        return self.mimetype.startswith("video")
+        return self._mimetype.startswith("video")
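A minimal, self-contained sketch of the behaviour this Media change is after (a stand-in class, not an import of src/media.py): the urls list collects one entry per storage via add_url(), and mimetype is now a lazy property backed by _mimetype.

from __future__ import annotations
from dataclasses import dataclass, field
from typing import List
import mimetypes

@dataclass
class MediaSketch:  # hypothetical stand-in for Media
    filename: str
    key: str = None
    urls: List[str] = field(default_factory=list)
    _mimetype: str = None
    id: str = ""

    def add_url(self, url: str) -> None:
        self.urls.append(url)

    @property
    def mimetype(self) -> str:
        # guessed lazily from the filename the first time it is requested
        if not self._mimetype:
            self._mimetype = mimetypes.guess_type(self.filename)[0]
        return self._mimetype

m = MediaSketch(filename="clip.mp4")
m.add_url("https://cdn.example.com/abc/clip.mp4")  # one entry per storage that stored it
m.add_url("./archived/abc/clip.mp4")
print(m.mimetype, m.urls)  # video/mp4 ['https://cdn.example.com/abc/clip.mp4', './archived/abc/clip.mp4']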
@@ -98,7 +98,6 @@ class Metadata:

     def add_media(self, media: Media) -> Metadata:
         if media is None: return
-        media.set_mimetype()
         return self.media.append(media)

     def get_media_by_id(self, id:str) -> Media:

@@ -110,7 +109,6 @@ class Metadata:
         if final:
             if self.final_media:
                 logger.warning(f"overwriting final media value :{self.final_media} with {final}")
-            final.set_mimetype()
             self.final_media = final
         return self
@@ -52,6 +52,7 @@ Cisticola considerations:
 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
 """


 class ArchivingOrchestrator:
     def __init__(self, config) -> None:
         # in config.py we should test that the archivers exist and log mismatches (blocking execution)

@@ -65,8 +66,8 @@ class ArchivingOrchestrator:
         # Archiver.init(a, config)
         # for a in config.archivers
         # ]
-        self.feeder : Feeder = config.feeder
-        self.formatter : Formatter = config.formatter
+        self.feeder: Feeder = config.feeder
+        self.formatter: Formatter = config.formatter
         self.enrichers = config.enrichers
         self.archivers: List[Archiverv2] = config.archivers
         self.databases: List[Database] = config.databases

@@ -173,11 +174,9 @@ class ArchivingOrchestrator:
             e.enrich(result)

         # store media
-        unstored_media = result.media[::]
-        result.media = []
         for s in self.storages:
-            for m in unstored_media:
-                result.media.append(s.store(m, result))
+            for m in result.media:
+                s.store(m, result)  # modifies media

         # formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
         # TODO: should there only be 1 formatter?

@@ -186,7 +185,8 @@ class ArchivingOrchestrator:
         # final format and store it
         if (final_media := self.formatter.format(result)):
             for s in self.storages:
-                result.set_final_media(s.store(final_media, result))
+                s.store(final_media, result)
+            result.set_final_media(final_media)

         # signal completion to databases (DBs, Google Sheets, CSV, ...)
         # a hash registration service could be one database: forensic archiving
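To make the orchestrator change concrete: store() no longer returns a replacement Media, each configured storage mutates the same object and appends its own URL. A toy illustration with stand-in classes (not the project's real Storage implementations, and the URLs are made up):

# toy stand-ins showing the in-place semantics of s.store(m, result)
class FakeMedia:
    def __init__(self, filename):
        self.filename, self.urls = filename, []

    def add_url(self, url):
        self.urls.append(url)

class FakeS3Storage:
    def store(self, media, item):
        media.add_url(f"https://bucket.example.com/{media.filename}")  # pretend upload

class FakeLocalStorage:
    def store(self, media, item):
        media.add_url(f"./archived/{media.filename}")  # pretend local copy

media_list = [FakeMedia("a.jpg"), FakeMedia("b.mp4")]
for s in [FakeS3Storage(), FakeLocalStorage()]:
    for m in media_list:
        s.store(m, item=None)  # modifies media, nothing is reassigned

print(media_list[0].urls)  # ['https://bucket.example.com/a.jpg', './archived/a.jpg']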
@@ -5,4 +5,5 @@ from .s3_storage import S3Config, S3Storage
 from .gd_storage import GDConfig, GDStorage

 from .storage import StorageV2
-from .s3 import S3StorageV2
+from .s3 import S3StorageV2
+from .local import LocalStorageV2
@@ -0,0 +1,46 @@
+
+import shutil
+from typing import IO, Any
+import boto3, uuid, os, mimetypes
+from botocore.errorfactory import ClientError
+from metadata import Metadata
+from media import Media
+from storages import StorageV2
+from loguru import logger
+from slugify import slugify
+
+
+class LocalStorageV2(StorageV2):
+    name = "local_storage"
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+        os.makedirs(self.save_to, exist_ok=True)
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
+            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
+            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
+        }
+
+    def get_cdn_url(self, media: Media) -> str:
+        dest = os.path.join(self.save_to, media.key)
+        if self.save_absolute:
+            dest = os.path.abspath(dest)
+        return dest
+
+    def upload(self, media: Media, **kwargs) -> bool:
+        # override parent so that we can use shutil.copy2 and keep metadata
+        if self.flatten:
+            dest = os.path.join(self.save_to, slugify(media.key))
+        else:
+            dest = os.path.join(self.save_to, media.key)
+
+        os.makedirs(dest, exist_ok=True)
+        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
+        shutil.copy2(media.filename, dest)
+        return True
+
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
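A rough illustration of what the three LocalStorageV2 options above translate to for a media key such as "example-url/1234.jpg"; this is a standalone sketch and the key and paths are made-up examples:

import os
from slugify import slugify  # python-slugify, already used by the storage above

save_to, key = "./archived", "example-url/1234.jpg"

# save_absolute=False keeps a relative cdn_url; True expands it (and leaks the local file structure)
relative_url = os.path.join(save_to, key)
absolute_url = os.path.abspath(relative_url)

# flatten=True collapses the key into a single slug under save_to; False keeps subdirectories
flattened_dest = os.path.join(save_to, slugify(key))  # './archived/example-url-1234-jpg'
nested_dest = os.path.join(save_to, key)              # './archived/example-url/1234.jpg'
print(relative_url, flattened_dest, nested_dest)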
@@ -45,26 +45,27 @@ class S3StorageV2(StorageV2):
     def get_cdn_url(self, media: Media) -> str:
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)

-    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any:
+    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
         extra_args = kwargs.get("extra_args", {})
         if not self.private and 'ACL' not in extra_args:
             extra_args['ACL'] = 'public-read'

         if 'ContentType' not in extra_args:
             try:
-                extra_args['ContentType'] = mimetypes.guess_type(media.key)[0]
+                extra_args['ContentType'] = media.mimetype
             except Exception as e:
                 logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")

         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
         return True

-    def exists(self, key: str) -> bool:
-        """
-        Tests if a given file with key=key exists in the bucket
-        """
-        try:
-            self.s3.head_object(Bucket=self.bucket, Key=key)
-            return True
-        except ClientError as e:
-            logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
-            return False
+    # def exists(self, key: str) -> bool:
+    #     """
+    #     Tests if a given file with key=key exists in the bucket
+    #     """
+    #     try:
+    #         self.s3.head_object(Bucket=self.bucket, Key=key)
+    #         return True
+    #     except ClientError as e:
+    #         logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
+    #         return False
@@ -7,6 +7,7 @@ from metadata import Metadata
 from steps.step import Step
 from loguru import logger
 import os, uuid
+from slugify import slugify


 @dataclass

@@ -21,23 +22,26 @@ class StorageV2(Step):
     def init(name: str, config: dict) -> StorageV2:
         return Step.init(name, config, StorageV2)

-    def store(self, media: Media, item: Metadata) -> Media:
-        media = self.set_key(media, item)
+    def store(self, media: Media, item: Metadata) -> None:
+        self.set_key(media, item)
         self.upload(media)
-        media.cdn_url = self.get_cdn_url(media)
-        return media
+        media.add_url(self.get_cdn_url(media))

     @abstractmethod
-    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass
+    def get_cdn_url(self, media: Media) -> str: pass

-    def upload(self, media: Media, **kwargs) -> Any:
-        logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}')
+    @abstractmethod
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
+
+    def upload(self, media: Media, **kwargs) -> bool:
+        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
         with open(media.filename, 'rb') as f:
             return self.uploadf(f, media, **kwargs)

-    def set_key(self, media: Media, item: Metadata) -> Media:
+    def set_key(self, media: Media, item: Metadata) -> None:
         """takes the media and optionally item info and generates a key"""
         if media.key is not None and len(media.key) > 0: return
         folder = item.get("folder", "")
         ext = os.path.splitext(media.filename)[1]
-        media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
-        return media
+        # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
+        media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
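For clarity on the new key layout: set_key() now nests each file under the item's folder and a slugified source URL, so all storages generate consistent keys for the same item. A small sketch of the resulting shape, where the folder name, source URL and extension are made-up examples:

import os, uuid
from slugify import slugify

folder = "my-spreadsheet"                     # item.get("folder", "")
source_url = "https://example.com/some/post"  # item.get_url()
ext = ".jpg"                                  # os.path.splitext(media.filename)[1]

key = os.path.join(folder, slugify(source_url), f"{str(uuid.uuid4())}{ext}")
print(key)  # e.g. my-spreadsheet/https-example-com-some-post/9b1deb4d-....jpg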