local storage + multiple storage support

2023-01-12 02:09:39 +00:00 · 2023-01-12 02:09:39 +00:00 · 6ca46417fe
commit 6ca46417fe
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2):
                if mp.entities:
                    other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
                    if len(other_media_urls):
-                        logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
+                        logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
                    for i, om_url in enumerate(other_media_urls):
                        filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
                        self.download_from_url(om_url, filename)
-                        result.add_media(Media(filename))
+                        result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
                filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
                filename = self.client.download_media(mp.media, filename_dest)
--- a/src/databases/gsheet_db.py
+++ b/src/databases/gsheet_db.py
@ -63,13 +63,13 @@ class GsheetsDb(Database):
        media: Media = item.get_single_media()
-        batch_if_valid('archive', media.cdn_url)
+        batch_if_valid('archive', "\n".join(media.urls))
        batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
        batch_if_valid('title', item.get_title())
        batch_if_valid('text', item.get("content", "")[:500])
        batch_if_valid('timestamp', item.get_timestamp())
        if (screenshot := item.get_media_by_id("screenshot")):
-            batch_if_valid('screenshot', screenshot.cdn_url)
+            batch_if_valid('screenshot', "\n".join(screenshot.urls))
        # batch_if_valid('status', item.status)
        # TODO: AFTER ENRICHMENTS
--- a/src/enrichers/wayback_enricher.py
+++ b/src/enrichers/wayback_enricher.py
@ -21,7 +21,7 @@ class WaybackEnricher(Enricher):
    @staticmethod
    def configs() -> dict:
        return {
-            "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
+            "timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
            "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
            "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
        }
--- a/src/formatters/templates/html_template.html
+++ b/src/formatters/templates/html_template.html
@ -26,6 +26,7 @@
        table,
        th,
        td {
            margin: auto;
            border: 1px solid;
            border-collapse: collapse;
        }
@ -43,18 +44,17 @@
 <body>
    <h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
-    <p>title: '<span>{{ title }}</span>'</p>
+    <p><b>title:</b> '<span>{{ title }}</span>'</p>
    <h2 class="center">content {{ media | length }} item(s)</h2>
    <table class="content">
        <tr>
            <th>about</th>
-            <th>preview</th>
+            <th>preview(s)</th>
        </tr>
        {% for m in media %}
        <tr>
            <td>
                <ul>
                    <li><a href="{{ m.cdn_url }}">ARCHIVE</a></li>
                    {% if m.hash | length > 1 %}
                    <li>hash: <span>{{ m.hash }}</span></li>
                    {% endif %}
@ -67,25 +67,28 @@
            </td>
            <td>
                {% for url in m.urls %}
                {% if 'image' in m.mimetype %}
-                <img src="{{ m.cdn_url }}" style="max-height:400px;max-width:400px;"></img>
+                <img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
                {% elif 'video' in m.mimetype %}
-                <video src="{{ m.cdn_url }}" controls style="max-height:400px;max-width:600px;">
+                <video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
                    Your browser does not support the video element.
                </video>
                {% elif 'audio' in m.mimetype %}
                <audio controls>
-                    <source src="{{ m.cdn_url }}" type="{{ m.mimetype }}">
+                    <source src="{{ url }}" type="{{ m.mimetype }}">
                    Your browser does not support the audio element.
                </audio>
                {% else %}
                No preview available, please open the link.
                {% endif %}
                <li><a href="{{ url }}">{{ url}}</a></li>
                {% endfor %}
            </td>
        </tr>
        {% endfor %}
    </table>
-    <h2>metadata</h2>
+    <h2 class="center">metadata</h2>
    <table class="metadata">
        <tr>
            <th>key</th>
@ -100,7 +103,7 @@
    </table>
    <hr>
-    <p>made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a>, add suggestions and report issues on the project's github page</p>
+    <p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
 </body>
 </html>
--- a/src/media.py
+++ b/src/media.py
@ -2,7 +2,7 @@
 from __future__ import annotations
 from ast import List
 from typing import Any, Union, Dict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import mimetypes
@ -10,15 +10,25 @@ import mimetypes
 class Media:
    filename: str
    key: str = None
-    cdn_url: str = None
+    urls: List[str] = field(default_factory=list)
-    mimetype: str = None  # eg: image/jpeg
+    _mimetype: str = None  # eg: image/jpeg
-    id: str = None # in case this type of media needs a special id, eg: screenshot
+    id: str = ""  # in case this type of media needs a special id, eg: screenshot
    # hash: str = None # TODO: added by enrichers
-    def set_mimetype(self) -> Media:
+    def add_url(self, url: str) -> None:
-        if not self.mimetype:
+        # url can be remote, local, ...
-            self.mimetype = mimetypes.guess_type(self.filename)[0]
+        self.urls.append(url)
-        return self
+
    @property  # getter .mimetype
    def mimetype(self) -> str:
        assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename"
        if not self._mimetype:
            self._mimetype = mimetypes.guess_type(self.filename)[0]
        return self._mimetype
    @mimetype.setter  # setter .mimetype
    def mimetype(self, v: str) -> None:
        self._mimetype = v
    def is_video(self) -> bool:
-        return self.mimetype.startswith("video")
+        return self._mimetype.startswith("video")
--- a/src/metadata.py
+++ b/src/metadata.py
@ -98,7 +98,6 @@ class Metadata:
    def add_media(self, media: Media) -> Metadata:
        if media is None: return
        media.set_mimetype()
        return self.media.append(media)
    def get_media_by_id(self, id:str) -> Media:
@ -110,7 +109,6 @@ class Metadata:
        if final:
            if self.final_media:
                logger.warning(f"overwriting final media value :{self.final_media} with {final}")
            final.set_mimetype()
            self.final_media = final
        return self
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@ -52,6 +52,7 @@ Cisticola considerations:
 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
 """
 class ArchivingOrchestrator:
    def __init__(self, config) -> None:
        # in config.py we should test that the archivers exist and log mismatches (blocking execution)
@ -65,8 +66,8 @@ class ArchivingOrchestrator:
        #     Archiver.init(a, config)
        #     for a in config.archivers
        # ]
-        self.feeder : Feeder = config.feeder
+        self.feeder: Feeder = config.feeder
-        self.formatter : Formatter = config.formatter
+        self.formatter: Formatter = config.formatter
        self.enrichers = config.enrichers
        self.archivers: List[Archiverv2] = config.archivers
        self.databases: List[Database] = config.databases
@ -173,11 +174,9 @@ class ArchivingOrchestrator:
            e.enrich(result)
        # store media
        unstored_media = result.media[::]
        result.media = []
        for s in self.storages:
-            for m in unstored_media:
+            for m in result.media:
-                result.media.append(s.store(m, result))
+                s.store(m, result)  # modifies media
        # formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
        # TODO: should there only be 1 formatter?
@ -186,7 +185,8 @@ class ArchivingOrchestrator:
        # final format and store it
        if (final_media := self.formatter.format(result)):
            for s in self.storages:
-                result.set_final_media(s.store(final_media, result))
+                s.store(final_media, result)
            result.set_final_media(final_media)
        # signal completion to databases (DBs, Google Sheets, CSV, ...)
        # a hash registration service could be one database: forensic archiving
--- a/src/storages/init.py
+++ b/src/storages/init.py
@ -5,4 +5,5 @@ from .s3_storage import S3Config, S3Storage
 from .gd_storage import GDConfig, GDStorage
 from .storage import StorageV2
-from .s3 import S3StorageV2
+from .s3 import S3StorageV2
 from .local import LocalStorageV2
--- a/src/storages/local.py
+++ b/src/storages/local.py
@ -0,0 +1,46 @@
 import shutil
 from typing import IO, Any
 import boto3, uuid, os, mimetypes
 from botocore.errorfactory import ClientError
 from metadata import Metadata
 from media import Media
 from storages import StorageV2
 from loguru import logger
 from slugify import slugify
 class LocalStorageV2(StorageV2):
    """Storage step that copies archived media into a folder on the local filesystem.

    Reads three settings from the instance: `save_to`, `flatten` and
    `save_absolute` (declared in `configs()`; presumably populated by the
    parent initialiser from `config` -- confirm against `StorageV2`/`Step`).
    """
    name = "local_storage"
    def __init__(self, config: dict) -> None:
        """Initialise the step from `config` and make sure the `save_to` folder exists."""
        super().__init__(config)
        os.makedirs(self.save_to, exist_ok=True)
    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this storage with defaults and help texts."""
        return {
            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
        }
    def _destination(self, media: Media) -> str:
        """Return the path below `save_to` where `media` is stored.

        Single source of truth for both `upload` and `get_cdn_url`, so the
        reported location always matches the place the file was copied to.
        """
        if self.flatten:
            # NOTE(review): slugify presumably also rewrites the '.' in front of
            # the file extension -- confirm whether the extension should survive.
            return os.path.join(self.save_to, slugify(media.key))
        return os.path.join(self.save_to, media.key)
    def get_cdn_url(self, media: Media) -> str:
        """Return the local path of the stored `media`, absolute when `save_absolute` is set."""
        dest = self._destination(media)
        if self.save_absolute:
            dest = os.path.abspath(dest)
        return dest
    def upload(self, media: Media, **kwargs) -> bool:
        """Copy `media.filename` to its destination and return True.

        Overrides the parent so that shutil.copy2 is used and file metadata is
        kept. Errors raised by the copy are not caught here.
        """
        dest = self._destination(media)
        # create only the parent folder: creating `dest` itself would turn the
        # target into a directory and copy2 would place the file inside it
        parent = os.path.dirname(dest)
        if parent:
            os.makedirs(parent, exist_ok=True)
        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
        shutil.copy2(media.filename, dest)
        return True
    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool:
        """No-op: present to satisfy the parent's interface, `upload` above never calls it."""
        pass
--- a/src/storages/s3.py
+++ b/src/storages/s3.py
@ -45,26 +45,27 @@ class S3StorageV2(StorageV2):
    def get_cdn_url(self, media: Media) -> str:
        """Build the URL of `media` by filling the `cdn_url` template of this storage.

        The template is formatted with the keyword fields `bucket`, `region`
        and `key` (the latter taken from `media.key`).
        """
        url_template = self.cdn_url
        fields = {"bucket": self.bucket, "region": self.region, "key": media.key}
        return url_template.format(**fields)
-    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any:
+    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
        extra_args = kwargs.get("extra_args", {})
        if not self.private and 'ACL' not in extra_args:
            extra_args['ACL'] = 'public-read'
        if 'ContentType' not in extra_args:
            try:
-                extra_args['ContentType'] = mimetypes.guess_type(media.key)[0]
+                extra_args['ContentType'] = media.mimetype
            except Exception as e:
                logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
        return True
-    def exists(self, key: str) -> bool:
+    # def exists(self, key: str) -> bool:
-        """
+    #     """
-        Tests if a given file with key=key exists in the bucket
+    #     Tests if a given file with key=key exists in the bucket
-        """
+    #     """
-        try:
+    #     try:
-            self.s3.head_object(Bucket=self.bucket, Key=key)
+    #         self.s3.head_object(Bucket=self.bucket, Key=key)
-            return True
+    #         return True
-        except ClientError as e:
+    #     except ClientError as e:
-            logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
+    #         logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
-        return False
+    #     return False
--- a/src/storages/storage.py
+++ b/src/storages/storage.py
@ -7,6 +7,7 @@ from metadata import Metadata
 from steps.step import Step
 from loguru import logger
 import os, uuid
 from slugify import slugify
@dataclass
@ -21,23 +22,26 @@ class StorageV2(Step):
    def init(name: str, config: dict) -> StorageV2:
        return Step.init(name, config, StorageV2)
-    def store(self, media: Media, item: Metadata) -> Media:
+    def store(self, media: Media, item: Metadata) -> None:
-        media = self.set_key(media, item)
+        self.set_key(media, item)
        self.upload(media)
-        media.cdn_url = self.get_cdn_url(media)
+        media.add_url(self.get_cdn_url(media))
        return media
    @abstractmethod
-    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass
+    def get_cdn_url(self, media: Media) -> str: pass
-    def upload(self, media: Media, **kwargs) -> Any:
+    @abstractmethod
-        logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}')
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
    def upload(self, media: Media, **kwargs) -> bool:
        """Open the local file of `media` in binary mode and pass it to `uploadf`.

        Extra keyword arguments are forwarded unchanged; the value returned by
        `uploadf` is returned to the caller.
        """
        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
        with open(media.filename, 'rb') as stream:
            outcome = self.uploadf(stream, media, **kwargs)
        return outcome
-    def set_key(self, media: Media, item: Metadata) -> Media:
+    def set_key(self, media: Media, item: Metadata) -> None:
        """takes the media and optionally item info and generates a key"""
        if media.key is not None and len(media.key) > 0: return
        folder = item.get("folder", "")
        ext = os.path.splitext(media.filename)[1]
-        media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
+        # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
-        return media
+        media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")