From 6ca46417feeda7f6ac586214cbf40917f9d9b50f Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 12 Jan 2023 02:09:39 +0000
Subject: [PATCH] local storage + multiple storage support

---
 src/archivers/telethon_archiverv2.py        |  4 +-
 src/databases/gsheet_db.py                  |  4 +-
 src/enrichers/wayback_enricher.py           |  2 +-
 src/formatters/templates/html_template.html | 19 +++++----
 src/media.py                                | 28 +++++++++----
 src/metadata.py                             |  2 -
 src/orchestrator.py                         | 14 +++----
 src/storages/__init__.py                    |  3 +-
 src/storages/local.py                       | 46 +++++++++++++++++++++
 src/storages/s3.py                          | 25 +++++------
 src/storages/storage.py                     | 24 ++++++-----
 11 files changed, 117 insertions(+), 54 deletions(-)
 create mode 100644 src/storages/local.py

diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
index 66ecd74..6851cb5 100644
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2):
         if mp.entities:
             other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
             if len(other_media_urls):
-                logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
+                logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
                 for i, om_url in enumerate(other_media_urls):
                     filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
                     self.download_from_url(om_url, filename)
-                    result.add_media(Media(filename))
+                    result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
 
         filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
         filename = self.client.download_media(mp.media, filename_dest)

diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py
index 26aae68..0cf65ed 100644
--- a/src/databases/gsheet_db.py
+++ b/src/databases/gsheet_db.py
@@ -63,13 +63,13 @@ class GsheetsDb(Database):
 
         media: Media = item.get_single_media()
 
-        batch_if_valid('archive', media.cdn_url)
+        batch_if_valid('archive', "\n".join(media.urls))
         batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
         batch_if_valid('title', item.get_title())
         batch_if_valid('text', item.get("content", "")[:500])
         batch_if_valid('timestamp', item.get_timestamp())
         if (screenshot := item.get_media_by_id("screenshot")):
-            batch_if_valid('screenshot', screenshot.cdn_url)
+            batch_if_valid('screenshot', "\n".join(screenshot.urls))
 
         # batch_if_valid('status', item.status)
         # TODO: AFTER ENRICHMENTS

diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py
index 09a43e0..bf55923 100644
--- a/src/enrichers/wayback_enricher.py
+++ b/src/enrichers/wayback_enricher.py
@@ -21,7 +21,7 @@ class WaybackEnricher(Enricher):
     @staticmethod
    def configs() -> dict:
        return {
-            "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
+            "timeout": {"default": 15, "help": "seconds to wait for a successful archive confirmation from wayback; if the timeout passes, the result contains only the job_id so the status can be checked manually later."},
             "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
             "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
         }

diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html
index fc986f0..f488a5f 100644
--- a/src/formatters/templates/html_template.html
+++ b/src/formatters/templates/html_template.html
@@ -26,6 +26,7 @@
         table, th, td {
+            margin: auto;
             border: 1px solid;
             border-collapse: collapse;
         }
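The gsheet_db.py hunk above means a single spreadsheet cell now references every stored copy of a file rather than one cdn_url. A minimal illustration of the resulting cell value (URLs hypothetical, not from the patch):

    # one line per storage that handled the media
    urls = ["https://cdn.example.com/abc.mp4", "./archived/abc.mp4"]
    cell_value = "\n".join(urls)
    # -> "https://cdn.example.com/abc.mp4\n./archived/abc.mp4"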

[The remaining html_template.html hunks (@@ -43,18 +44,17 @@, @@ -67,25 +67,28 @@ and @@ -100,7 +103,7 @@) lost their HTML markup in extraction and cannot be reconstructed verbatim. The recoverable changes: the media table's "preview" column header becomes "preview(s)"; each media row keeps its ARCHIVE link and conditional hash display ({% if m.hash | length > 1 %} hash: {{ m.hash }} {% endif %}); the preview cell now loops with {% for url in m.urls %}, rendering one image or video element per stored URL depending on whether 'image' or 'video' is in m.mimetype; and the footer "made with bellingcat/auto-archiver, add suggestions and report issues on the project's github page" is shortened to "Made with bellingcat/auto-archiver".]
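The template loop above works because of the media.py hunk below: Media.cdn_url (a single string) becomes a urls list with an add_url() helper, and mimetype becomes a lazily-guessed property. A minimal sketch of the resulting behaviour (values hypothetical, assuming the dataclass exactly as defined below and the repo's src layout on the import path):

    from media import Media

    m = Media(filename="tmp/video.mp4")
    m.add_url("https://cdn.example.com/abc.mp4")  # e.g. appended by an S3 storage
    m.add_url("./archived/abc.mp4")               # e.g. appended by a local storage
    print(m.mimetype)  # "video/mp4", guessed from the filename on first access
    print(m.urls)      # both stored locations, in storage order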
\ No newline at end of file

diff --git a/src/media.py b/src/media.py
index 3c416be..e50cc14 100644
--- a/src/media.py
+++ b/src/media.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from ast import List
 from typing import Any, Union, Dict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import mimetypes
 
 
 @dataclass
@@ -10,15 +10,25 @@ import mimetypes
 class Media:
     filename: str
     key: str = None
-    cdn_url: str = None
-    mimetype: str = None  # eg: image/jpeg
-    id: str = None  # in case this type of media needs a special id, eg: screenshot
+    urls: List[str] = field(default_factory=list)
+    _mimetype: str = None  # eg: image/jpeg
+    id: str = ""  # in case this type of media needs a special id, eg: screenshot
     # hash: str = None # TODO: added by enrichers
 
-    def set_mimetype(self) -> Media:
-        if not self.mimetype:
-            self.mimetype = mimetypes.guess_type(self.filename)[0]
-        return self
+    def add_url(self, url: str) -> None:
+        # url can be remote, local, ...
+        self.urls.append(url)
+
+    @property  # getter .mimetype
+    def mimetype(self) -> str:
+        assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename"
+        if not self._mimetype:
+            self._mimetype = mimetypes.guess_type(self.filename)[0]
+        return self._mimetype
+
+    @mimetype.setter  # setter .mimetype
+    def mimetype(self, v: str) -> None:
+        self._mimetype = v
 
     def is_video(self) -> bool:
-        return self.mimetype.startswith("video")
+        return self._mimetype.startswith("video")

diff --git a/src/metadata.py b/src/metadata.py
index 7af923c..7f57c3b 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -98,7 +98,6 @@ class Metadata:
 
     def add_media(self, media: Media) -> Metadata:
         if media is None: return
-        media.set_mimetype()
         return self.media.append(media)
 
     def get_media_by_id(self, id:str) -> Media:
@@ -110,7 +109,6 @@ class Metadata:
 
         if final:
             if self.final_media:
                 logger.warning(f"overwriting final media value :{self.final_media} with {final}")
-            final.set_mimetype()
             self.final_media = final
         return self

diff --git a/src/orchestrator.py b/src/orchestrator.py
index 3d554e0..612ea2b 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -52,6 +52,7 @@ Cisticola considerations:
 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
 """
 
+
 class ArchivingOrchestrator:
     def __init__(self, config) -> None:
         # in config.py we should test that the archivers exist and log mismatches (blocking execution)
@@ -65,8 +66,8 @@ class ArchivingOrchestrator:
         #     Archiver.init(a, config)
         #     for a in config.archivers
         # ]
-        self.feeder : Feeder = config.feeder
-        self.formatter : Formatter = config.formatter
+        self.feeder: Feeder = config.feeder
+        self.formatter: Formatter = config.formatter
         self.enrichers = config.enrichers
         self.archivers: List[Archiverv2] = config.archivers
         self.databases: List[Database] = config.databases
@@ -173,11 +174,9 @@ class ArchivingOrchestrator:
             e.enrich(result)
 
         # store media
-        unstored_media = result.media[::]
-        result.media = []
         for s in self.storages:
-            for m in unstored_media:
-                result.media.append(s.store(m, result))
+            for m in result.media:
+                s.store(m, result)  # modifies media
         # formatters, enrichers, and storages will sometimes look for specific properties: eg
         # - Screenshot:
         # TODO: should there only be 1 formatter?
@@ -186,7 +185,8 @@ class ArchivingOrchestrator:
         # final format and store it
         if (final_media := self.formatter.format(result)):
             for s in self.storages:
-                result.set_final_media(s.store(final_media, result))
+                s.store(final_media, result)
+            result.set_final_media(final_media)
 
         # signal completion to databases (DBs, Google Sheets, CSV, ...)
         # a hash registration service could be one database: forensic archiving

diff --git a/src/storages/__init__.py b/src/storages/__init__.py
index 91ce148..4c0783c 100644
--- a/src/storages/__init__.py
+++ b/src/storages/__init__.py
@@ -5,4 +5,5 @@ from .s3_storage import S3Config, S3Storage
 from .gd_storage import GDConfig, GDStorage
 
 from .storage import StorageV2
-from .s3 import S3StorageV2
\ No newline at end of file
+from .s3 import S3StorageV2
+from .local import LocalStorageV2
\ No newline at end of file

diff --git a/src/storages/local.py b/src/storages/local.py
new file mode 100644
index 0000000..aafb28c
--- /dev/null
+++ b/src/storages/local.py
@@ -0,0 +1,46 @@
+
+import shutil
+from typing import IO, Any
+import boto3, uuid, os, mimetypes
+from botocore.errorfactory import ClientError
+from metadata import Metadata
+from media import Media
+from storages import StorageV2
+from loguru import logger
+from slugify import slugify
+
+
+class LocalStorageV2(StorageV2):
+    name = "local_storage"
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+        os.makedirs(self.save_to, exist_ok=True)
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
+            "flatten": {"default": True, "help": "if true saves all files to the root of the 'save_to' directory, if false preserves subdir structure"},
+            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (absolute paths leak the local file structure)"},
+        }
+
+    def get_cdn_url(self, media: Media) -> str:
+        dest = os.path.join(self.save_to, media.key)
+        if self.save_absolute:
+            dest = os.path.abspath(dest)
+        return dest
+
+    def upload(self, media: Media, **kwargs) -> bool:
+        # override parent so that we can use shutil.copy2 and keep file metadata
+        if self.flatten:
+            dest = os.path.join(self.save_to, slugify(media.key))
+        else:
+            dest = os.path.join(self.save_to, media.key)
+        os.makedirs(os.path.dirname(dest), exist_ok=True)  # create the parent folder, not the destination path itself
+        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
+        shutil.copy2(media.filename, dest)
+        return True
+
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass

diff --git a/src/storages/s3.py b/src/storages/s3.py
index d4457e8..acd907e 100644
--- a/src/storages/s3.py
+++ b/src/storages/s3.py
@@ -45,26 +45,27 @@ class S3StorageV2(StorageV2):
     def get_cdn_url(self, media: Media) -> str:
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
 
-    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any:
+    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> bool:
         extra_args = kwargs.get("extra_args", {})
         if not self.private and 'ACL' not in extra_args:
             extra_args['ACL'] = 'public-read'
         if 'ContentType' not in extra_args:
             try:
-                extra_args['ContentType'] = mimetypes.guess_type(media.key)[0]
+                extra_args['ContentType'] = media.mimetype
             except Exception as e:
                 logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
+        return True
 
-    def exists(self, key: str) -> bool:
-        """
-        Tests if a given file with key=key exists in the bucket
-        """
-        try:
-            self.s3.head_object(Bucket=self.bucket, Key=key)
-            return True
-        except ClientError as e:
-            logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
-            return False
+    # def exists(self, key: str) -> bool:
+    #     """
+    #     Tests if a given file with key=key exists in the bucket
+    #     """
+    #     try:
+    #         self.s3.head_object(Bucket=self.bucket, Key=key)
+    #         return True
+    #     except ClientError as e:
+    #         logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
+    #         return False

diff --git a/src/storages/storage.py b/src/storages/storage.py
index 06346e9..61d4c77 100644
--- a/src/storages/storage.py
+++ b/src/storages/storage.py
@@ -7,6 +7,7 @@ from metadata import Metadata
 from steps.step import Step
 from loguru import logger
 import os, uuid
+from slugify import slugify
 
 
 @dataclass
@@ -21,23 +22,26 @@ class StorageV2(Step):
     def init(name: str, config: dict) -> StorageV2:
         return Step.init(name, config, StorageV2)
 
-    def store(self, media: Media, item: Metadata) -> Media:
-        media = self.set_key(media, item)
+    def store(self, media: Media, item: Metadata) -> None:
+        self.set_key(media, item)
         self.upload(media)
-        media.cdn_url = self.get_cdn_url(media)
-        return media
+        media.add_url(self.get_cdn_url(media))
 
     @abstractmethod
-    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass
+    def get_cdn_url(self, media: Media) -> str: pass
 
-    def upload(self, media: Media, **kwargs) -> Any:
-        logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}')
+    @abstractmethod
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
+
+    def upload(self, media: Media, **kwargs) -> bool:
+        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
         with open(media.filename, 'rb') as f:
             return self.uploadf(f, media, **kwargs)
 
-    def set_key(self, media: Media, item: Metadata) -> Media:
+    def set_key(self, media: Media, item: Metadata) -> None:
         """takes the media and optionally item info and generates a key"""
+        if media.key is not None and len(media.key) > 0: return
         folder = item.get("folder", "")
         ext = os.path.splitext(media.filename)[1]
-        media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
-        return media
+        # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
+        media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
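Taken together, StorageV2.store() now mutates the Media in place: set_key() builds a key of the form <folder>/<slugified-url>/<uuid><ext> (skipping media that already have one), upload() writes the file, and the storage-specific location is appended via media.add_url(). A worked sketch of the new key scheme (folder, url and extension values hypothetical):

    import os, uuid
    from slugify import slugify

    folder, url, ext = "my-folder", "https://t.me/some_channel/123", ".mp4"
    key = os.path.join(folder, slugify(url), f"{str(uuid.uuid4())}{ext}")
    print(key)  # e.g. my-folder/https-t-me-some-channel-123/<random-uuid>.mp4

With several storages configured, the orchestrator loop calls store() once per storage, so each Media ends up with one entry in media.urls per storage.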