kopia lustrzana https://github.com/bellingcat/auto-archiver
local storage + multiple storage support
rodzic
0cb593fd21
commit
6ca46417fe
|
@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2):
|
||||||
if mp.entities:
|
if mp.entities:
|
||||||
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
|
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
|
||||||
if len(other_media_urls):
|
if len(other_media_urls):
|
||||||
logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
|
logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
|
||||||
for i, om_url in enumerate(other_media_urls):
|
for i, om_url in enumerate(other_media_urls):
|
||||||
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
|
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
|
||||||
self.download_from_url(om_url, filename)
|
self.download_from_url(om_url, filename)
|
||||||
result.add_media(Media(filename))
|
result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
|
||||||
|
|
||||||
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
|
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
|
||||||
filename = self.client.download_media(mp.media, filename_dest)
|
filename = self.client.download_media(mp.media, filename_dest)
|
||||||
|
|
|
@ -63,13 +63,13 @@ class GsheetsDb(Database):
|
||||||
|
|
||||||
media: Media = item.get_single_media()
|
media: Media = item.get_single_media()
|
||||||
|
|
||||||
batch_if_valid('archive', media.cdn_url)
|
batch_if_valid('archive', "\n".join(media.urls))
|
||||||
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
|
||||||
batch_if_valid('title', item.get_title())
|
batch_if_valid('title', item.get_title())
|
||||||
batch_if_valid('text', item.get("content", "")[:500])
|
batch_if_valid('text', item.get("content", "")[:500])
|
||||||
batch_if_valid('timestamp', item.get_timestamp())
|
batch_if_valid('timestamp', item.get_timestamp())
|
||||||
if (screenshot := item.get_media_by_id("screenshot")):
|
if (screenshot := item.get_media_by_id("screenshot")):
|
||||||
batch_if_valid('screenshot', screenshot.cdn_url)
|
batch_if_valid('screenshot', "\n".join(screenshot.urls))
|
||||||
# batch_if_valid('status', item.status)
|
# batch_if_valid('status', item.status)
|
||||||
|
|
||||||
# TODO: AFTER ENRICHMENTS
|
# TODO: AFTER ENRICHMENTS
|
||||||
|
|
|
@ -21,7 +21,7 @@ class WaybackEnricher(Enricher):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def configs() -> dict:
|
def configs() -> dict:
|
||||||
return {
|
return {
|
||||||
"timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
|
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
|
||||||
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
|
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
|
||||||
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
|
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,6 +26,7 @@
|
||||||
table,
|
table,
|
||||||
th,
|
th,
|
||||||
td {
|
td {
|
||||||
|
margin: auto;
|
||||||
border: 1px solid;
|
border: 1px solid;
|
||||||
border-collapse: collapse;
|
border-collapse: collapse;
|
||||||
}
|
}
|
||||||
|
@ -43,18 +44,17 @@
|
||||||
|
|
||||||
<body>
|
<body>
|
||||||
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
|
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
|
||||||
<p>title: '<span>{{ title }}</span>'</p>
|
<p><b>title:</b> '<span>{{ title }}</span>'</p>
|
||||||
<h2 class="center">content {{ media | length }} item(s)</h2>
|
<h2 class="center">content {{ media | length }} item(s)</h2>
|
||||||
<table class="content">
|
<table class="content">
|
||||||
<tr>
|
<tr>
|
||||||
<th>about</th>
|
<th>about</th>
|
||||||
<th>preview</th>
|
<th>preview(s)</th>
|
||||||
</tr>
|
</tr>
|
||||||
{% for m in media %}
|
{% for m in media %}
|
||||||
<tr>
|
<tr>
|
||||||
<td>
|
<td>
|
||||||
<ul>
|
<ul>
|
||||||
<li><a href="{{ m.cdn_url }}">ARCHIVE</a></li>
|
|
||||||
{% if m.hash | length > 1 %}
|
{% if m.hash | length > 1 %}
|
||||||
<li>hash: <span>{{ m.hash }}</span></li>
|
<li>hash: <span>{{ m.hash }}</span></li>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
@ -67,25 +67,28 @@
|
||||||
|
|
||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
|
{% for url in m.urls %}
|
||||||
{% if 'image' in m.mimetype %}
|
{% if 'image' in m.mimetype %}
|
||||||
<img src="{{ m.cdn_url }}" style="max-height:400px;max-width:400px;"></img>
|
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
|
||||||
{% elif 'video' in m.mimetype %}
|
{% elif 'video' in m.mimetype %}
|
||||||
<video src="{{ m.cdn_url }}" controls style="max-height:400px;max-width:600px;">
|
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
|
||||||
Your browser does not support the video element.
|
Your browser does not support the video element.
|
||||||
</video>
|
</video>
|
||||||
{% elif 'audio' in m.mimetype %}
|
{% elif 'audio' in m.mimetype %}
|
||||||
<audio controls>
|
<audio controls>
|
||||||
<source src="{{ m.cdn_url }}" type="{{ m.mimetype }}">
|
<source src="{{ url }}" type="{{ m.mimetype }}">
|
||||||
Your browser does not support the audio element.
|
Your browser does not support the audio element.
|
||||||
</audio>
|
</audio>
|
||||||
{% else %}
|
{% else %}
|
||||||
No preview available, please open the link.
|
No preview available, please open the link.
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
<li><a href="{{ url }}">{{ url}}</a></li>
|
||||||
|
{% endfor %}
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</table>
|
</table>
|
||||||
<h2>metadata</h2>
|
<h2 class="center">metadata</h2>
|
||||||
<table class="metadata">
|
<table class="metadata">
|
||||||
<tr>
|
<tr>
|
||||||
<th>key</th>
|
<th>key</th>
|
||||||
|
@ -100,7 +103,7 @@
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
<hr>
|
<hr>
|
||||||
<p>made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a>, add suggestions and report issues on the project's github page</p>
|
<p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
|
||||||
</body>
|
</body>
|
||||||
|
|
||||||
</html>
|
</html>
|
28
src/media.py
28
src/media.py
|
@ -2,7 +2,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from ast import List
|
from ast import List
|
||||||
from typing import Any, Union, Dict
|
from typing import Any, Union, Dict
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
|
|
||||||
|
@ -10,15 +10,25 @@ import mimetypes
|
||||||
class Media:
|
class Media:
|
||||||
filename: str
|
filename: str
|
||||||
key: str = None
|
key: str = None
|
||||||
cdn_url: str = None
|
urls: List[str] = field(default_factory=list)
|
||||||
mimetype: str = None # eg: image/jpeg
|
_mimetype: str = None # eg: image/jpeg
|
||||||
id: str = None # in case this type of media needs a special id, eg: screenshot
|
id: str = "" # in case this type of media needs a special id, eg: screenshot
|
||||||
# hash: str = None # TODO: added by enrichers
|
# hash: str = None # TODO: added by enrichers
|
||||||
|
|
||||||
def set_mimetype(self) -> Media:
|
def add_url(self, url: str) -> None:
|
||||||
if not self.mimetype:
|
# url can be remote, local, ...
|
||||||
self.mimetype = mimetypes.guess_type(self.filename)[0]
|
self.urls.append(url)
|
||||||
return self
|
|
||||||
|
@property # getter .mimetype
|
||||||
|
def mimetype(self) -> str:
|
||||||
|
assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename"
|
||||||
|
if not self._mimetype:
|
||||||
|
self._mimetype = mimetypes.guess_type(self.filename)[0]
|
||||||
|
return self._mimetype
|
||||||
|
|
||||||
|
@mimetype.setter # setter .mimetype
|
||||||
|
def mimetype(self, v: str) -> None:
|
||||||
|
self._mimetype = v
|
||||||
|
|
||||||
def is_video(self) -> bool:
|
def is_video(self) -> bool:
|
||||||
return self.mimetype.startswith("video")
|
return self._mimetype.startswith("video")
|
||||||
|
|
|
@ -98,7 +98,6 @@ class Metadata:
|
||||||
|
|
||||||
def add_media(self, media: Media) -> Metadata:
|
def add_media(self, media: Media) -> Metadata:
|
||||||
if media is None: return
|
if media is None: return
|
||||||
media.set_mimetype()
|
|
||||||
return self.media.append(media)
|
return self.media.append(media)
|
||||||
|
|
||||||
def get_media_by_id(self, id:str) -> Media:
|
def get_media_by_id(self, id:str) -> Media:
|
||||||
|
@ -110,7 +109,6 @@ class Metadata:
|
||||||
if final:
|
if final:
|
||||||
if self.final_media:
|
if self.final_media:
|
||||||
logger.warning(f"overwriting final media value :{self.final_media} with {final}")
|
logger.warning(f"overwriting final media value :{self.final_media} with {final}")
|
||||||
final.set_mimetype()
|
|
||||||
self.final_media = final
|
self.final_media = final
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
|
@ -52,6 +52,7 @@ Cisticola considerations:
|
||||||
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
|
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
class ArchivingOrchestrator:
|
class ArchivingOrchestrator:
|
||||||
def __init__(self, config) -> None:
|
def __init__(self, config) -> None:
|
||||||
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
|
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
|
||||||
|
@ -65,8 +66,8 @@ class ArchivingOrchestrator:
|
||||||
# Archiver.init(a, config)
|
# Archiver.init(a, config)
|
||||||
# for a in config.archivers
|
# for a in config.archivers
|
||||||
# ]
|
# ]
|
||||||
self.feeder : Feeder = config.feeder
|
self.feeder: Feeder = config.feeder
|
||||||
self.formatter : Formatter = config.formatter
|
self.formatter: Formatter = config.formatter
|
||||||
self.enrichers = config.enrichers
|
self.enrichers = config.enrichers
|
||||||
self.archivers: List[Archiverv2] = config.archivers
|
self.archivers: List[Archiverv2] = config.archivers
|
||||||
self.databases: List[Database] = config.databases
|
self.databases: List[Database] = config.databases
|
||||||
|
@ -173,11 +174,9 @@ class ArchivingOrchestrator:
|
||||||
e.enrich(result)
|
e.enrich(result)
|
||||||
|
|
||||||
# store media
|
# store media
|
||||||
unstored_media = result.media[::]
|
|
||||||
result.media = []
|
|
||||||
for s in self.storages:
|
for s in self.storages:
|
||||||
for m in unstored_media:
|
for m in result.media:
|
||||||
result.media.append(s.store(m, result))
|
s.store(m, result) # modifies media
|
||||||
|
|
||||||
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
|
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
|
||||||
# TODO: should there only be 1 formatter?
|
# TODO: should there only be 1 formatter?
|
||||||
|
@ -186,7 +185,8 @@ class ArchivingOrchestrator:
|
||||||
# final format and store it
|
# final format and store it
|
||||||
if (final_media := self.formatter.format(result)):
|
if (final_media := self.formatter.format(result)):
|
||||||
for s in self.storages:
|
for s in self.storages:
|
||||||
result.set_final_media(s.store(final_media, result))
|
s.store(final_media, result)
|
||||||
|
result.set_final_media(final_media)
|
||||||
|
|
||||||
# signal completion to databases (DBs, Google Sheets, CSV, ...)
|
# signal completion to databases (DBs, Google Sheets, CSV, ...)
|
||||||
# a hash registration service could be one database: forensic archiving
|
# a hash registration service could be one database: forensic archiving
|
||||||
|
|
|
@ -5,4 +5,5 @@ from .s3_storage import S3Config, S3Storage
|
||||||
from .gd_storage import GDConfig, GDStorage
|
from .gd_storage import GDConfig, GDStorage
|
||||||
|
|
||||||
from .storage import StorageV2
|
from .storage import StorageV2
|
||||||
from .s3 import S3StorageV2
|
from .s3 import S3StorageV2
|
||||||
|
from .local import LocalStorageV2
|
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
from typing import IO, Any
|
||||||
|
import boto3, uuid, os, mimetypes
|
||||||
|
from botocore.errorfactory import ClientError
|
||||||
|
from metadata import Metadata
|
||||||
|
from media import Media
|
||||||
|
from storages import StorageV2
|
||||||
|
from loguru import logger
|
||||||
|
from slugify import slugify
|
||||||
|
|
||||||
|
|
||||||
|
class LocalStorageV2(StorageV2):
    """Storage step that copies archived media into a folder on the local filesystem.

    The attributes ``save_to``, ``flatten`` and ``save_absolute`` used below are
    the options declared in :meth:`configs`.
    NOTE(review): they are presumably set on the instance by the base-class
    initialiser from ``config`` -- confirm against ``StorageV2``/``Step``.
    """
    name = "local_storage"

    def __init__(self, config: dict) -> None:
        """Initialise the step and make sure the root output folder exists."""
        super().__init__(config)
        os.makedirs(self.save_to, exist_ok=True)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options (defaults and help text) of this storage."""
        return {
            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
        }

    def _dest_path(self, media: Media) -> str:
        """Return the path under ``save_to`` where ``media`` is (or will be) stored.

        Single source of truth shared by :meth:`upload` and :meth:`get_cdn_url`,
        so the reported location always matches the file that was written.
        """
        if self.flatten:
            # Slugify only the stem: slugifying the whole key would also turn
            # the "." before the extension into "-" and lose the file type.
            stem, ext = os.path.splitext(media.key)
            return os.path.join(self.save_to, f"{slugify(stem)}{ext}")
        return os.path.join(self.save_to, media.key)

    def get_cdn_url(self, media: Media) -> str:
        """Return the local path of the stored ``media``.

        The path is made absolute when ``save_absolute`` is set, otherwise it is
        returned as built from ``save_to``.
        """
        dest = self._dest_path(media)
        if self.save_absolute:
            dest = os.path.abspath(dest)
        return dest

    def upload(self, media: Media, **kwargs) -> bool:
        """Copy ``media.filename`` to its destination and return ``True``.

        Overrides the parent implementation so that ``shutil.copy2`` can be
        used, which also preserves file metadata.
        """
        dest = self._dest_path(media)

        # Create the *parent* folder only: creating ``dest`` itself would turn
        # the target file path into a directory and the copy would land inside it.
        parent = os.path.dirname(dest)
        if parent:
            os.makedirs(parent, exist_ok=True)

        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
        shutil.copy2(media.filename, dest)
        return True

    # Not used by this storage: ``upload`` is overridden and copies the file
    # directly, so this override only exists to give the method a body.
    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
|
@ -45,26 +45,27 @@ class S3StorageV2(StorageV2):
|
||||||
def get_cdn_url(self, media: Media) -> str:
|
def get_cdn_url(self, media: Media) -> str:
|
||||||
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
|
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
|
||||||
|
|
||||||
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any:
|
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
|
||||||
extra_args = kwargs.get("extra_args", {})
|
extra_args = kwargs.get("extra_args", {})
|
||||||
if not self.private and 'ACL' not in extra_args:
|
if not self.private and 'ACL' not in extra_args:
|
||||||
extra_args['ACL'] = 'public-read'
|
extra_args['ACL'] = 'public-read'
|
||||||
|
|
||||||
if 'ContentType' not in extra_args:
|
if 'ContentType' not in extra_args:
|
||||||
try:
|
try:
|
||||||
extra_args['ContentType'] = mimetypes.guess_type(media.key)[0]
|
extra_args['ContentType'] = media.mimetype
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
|
||||||
|
|
||||||
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
|
||||||
|
return True
|
||||||
|
|
||||||
def exists(self, key: str) -> bool:
|
# def exists(self, key: str) -> bool:
|
||||||
"""
|
# """
|
||||||
Tests if a given file with key=key exists in the bucket
|
# Tests if a given file with key=key exists in the bucket
|
||||||
"""
|
# """
|
||||||
try:
|
# try:
|
||||||
self.s3.head_object(Bucket=self.bucket, Key=key)
|
# self.s3.head_object(Bucket=self.bucket, Key=key)
|
||||||
return True
|
# return True
|
||||||
except ClientError as e:
|
# except ClientError as e:
|
||||||
logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
|
# logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
|
||||||
return False
|
# return False
|
||||||
|
|
|
@ -7,6 +7,7 @@ from metadata import Metadata
|
||||||
from steps.step import Step
|
from steps.step import Step
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
import os, uuid
|
import os, uuid
|
||||||
|
from slugify import slugify
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -21,23 +22,26 @@ class StorageV2(Step):
|
||||||
def init(name: str, config: dict) -> StorageV2:
|
def init(name: str, config: dict) -> StorageV2:
|
||||||
return Step.init(name, config, StorageV2)
|
return Step.init(name, config, StorageV2)
|
||||||
|
|
||||||
def store(self, media: Media, item: Metadata) -> Media:
|
def store(self, media: Media, item: Metadata) -> None:
|
||||||
media = self.set_key(media, item)
|
self.set_key(media, item)
|
||||||
self.upload(media)
|
self.upload(media)
|
||||||
media.cdn_url = self.get_cdn_url(media)
|
media.add_url(self.get_cdn_url(media))
|
||||||
return media
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass
|
def get_cdn_url(self, media: Media) -> str: pass
|
||||||
|
|
||||||
def upload(self, media: Media, **kwargs) -> Any:
|
@abstractmethod
|
||||||
logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}')
|
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
|
||||||
|
|
||||||
|
def upload(self, media: Media, **kwargs) -> bool:
|
||||||
|
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
|
||||||
with open(media.filename, 'rb') as f:
|
with open(media.filename, 'rb') as f:
|
||||||
return self.uploadf(f, media, **kwargs)
|
return self.uploadf(f, media, **kwargs)
|
||||||
|
|
||||||
def set_key(self, media: Media, item: Metadata) -> Media:
|
def set_key(self, media: Media, item: Metadata) -> None:
|
||||||
"""takes the media and optionally item info and generates a key"""
|
"""takes the media and optionally item info and generates a key"""
|
||||||
|
if media.key is not None and len(media.key) > 0: return
|
||||||
folder = item.get("folder", "")
|
folder = item.get("folder", "")
|
||||||
ext = os.path.splitext(media.filename)[1]
|
ext = os.path.splitext(media.filename)[1]
|
||||||
media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
|
# media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
|
||||||
return media
|
media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
|
||||||
|
|
Ładowanie…
Reference in New Issue