local storage + multiple storage support

pull/72/head
msramalho 2023-01-12 02:09:39 +00:00
rodzic 0cb593fd21
commit 6ca46417fe
11 zmienionych plików z 117 dodań i 54 usunięć

Wyświetl plik

@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2):
if mp.entities:
other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
if len(other_media_urls):
logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
for i, om_url in enumerate(other_media_urls):
filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
self.download_from_url(om_url, filename)
result.add_media(Media(filename))
result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
filename = self.client.download_media(mp.media, filename_dest)

Wyświetl plik

@ -63,13 +63,13 @@ class GsheetsDb(Database):
media: Media = item.get_single_media()
batch_if_valid('archive', media.cdn_url)
batch_if_valid('archive', "\n".join(media.urls))
batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
batch_if_valid('title', item.get_title())
batch_if_valid('text', item.get("content", "")[:500])
batch_if_valid('timestamp', item.get_timestamp())
if (screenshot := item.get_media_by_id("screenshot")):
batch_if_valid('screenshot', screenshot.cdn_url)
batch_if_valid('screenshot', "\n".join(screenshot.urls))
# batch_if_valid('status', item.status)
# TODO: AFTER ENRICHMENTS

Wyświetl plik

@ -21,7 +21,7 @@ class WaybackEnricher(Enricher):
@staticmethod
def configs() -> dict:
return {
"timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
"timeout": {"default": 15, "help": "seconds to wait for successful archive confirmation from wayback, if more than this passes the result contains the job_id so the status can later be checked manually."},
"key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
"secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
}

Wyświetl plik

@ -26,6 +26,7 @@
table,
th,
td {
margin: auto;
border: 1px solid;
border-collapse: collapse;
}
@ -43,18 +44,17 @@
<body>
<h2>Archived media for <a href="{{ url }}">{{ url }}</a></h2>
<p>title: '<span>{{ title }}</span>'</p>
<p><b>title:</b> '<span>{{ title }}</span>'</p>
<h2 class="center">content {{ media | length }} item(s)</h2>
<table class="content">
<tr>
<th>about</th>
<th>preview</th>
<th>preview(s)</th>
</tr>
{% for m in media %}
<tr>
<td>
<ul>
<li><a href="{{ m.cdn_url }}">ARCHIVE</a></li>
{% if m.hash | length > 1 %}
<li>hash: <span>{{ m.hash }}</span></li>
{% endif %}
@ -67,25 +67,28 @@
</td>
<td>
{% for url in m.urls %}
{% if 'image' in m.mimetype %}
<img src="{{ m.cdn_url }}" style="max-height:400px;max-width:400px;"></img>
<img src="{{ url }}" style="max-height:400px;max-width:400px;"></img>
{% elif 'video' in m.mimetype %}
<video src="{{ m.cdn_url }}" controls style="max-height:400px;max-width:600px;">
<video src="{{ url }}" controls style="max-height:400px;max-width:600px;">
Your browser does not support the video element.
</video>
{% elif 'audio' in m.mimetype %}
<audio controls>
<source src="{{ m.cdn_url }}" type="{{ m.mimetype }}">
<source src="{{ url }}" type="{{ m.mimetype }}">
Your browser does not support the audio element.
</audio>
{% else %}
No preview available, please open the link.
{% endif %}
<li><a href="{{ url }}">{{ url}}</a></li>
{% endfor %}
</td>
</tr>
{% endfor %}
</table>
<h2>metadata</h2>
<h2 class="center">metadata</h2>
<table class="metadata">
<tr>
<th>key</th>
@ -100,7 +103,7 @@
</table>
<hr>
<p>made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a>, add suggestions and report issues on the project's github page</p>
<p style="text-align:center;">Made with <a href="https://github.com/bellingcat/auto-archiver">bellingcat/auto-archiver</a></p>
</body>
</html>

Wyświetl plik

@ -2,7 +2,7 @@
from __future__ import annotations
from ast import List
from typing import Any, Union, Dict
from dataclasses import dataclass
from dataclasses import dataclass, field
import mimetypes
@ -10,15 +10,25 @@ import mimetypes
class Media:
filename: str
key: str = None
cdn_url: str = None
mimetype: str = None # eg: image/jpeg
id: str = None # in case this type of media needs a special id, eg: screenshot
urls: List[str] = field(default_factory=list)
_mimetype: str = None # eg: image/jpeg
id: str = "" # in case this type of media needs a special id, eg: screenshot
# hash: str = None # TODO: added by enrichers
def set_mimetype(self) -> Media:
if not self.mimetype:
self.mimetype = mimetypes.guess_type(self.filename)[0]
return self
def add_url(self, url: str) -> None:
# url can be remote, local, ...
self.urls.append(url)
@property # getter .mimetype
def mimetype(self) -> str:
assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename"
if not self._mimetype:
self._mimetype = mimetypes.guess_type(self.filename)[0]
return self._mimetype
@mimetype.setter # setter .mimetype
def mimetype(self, v: str) -> None:
self._mimetype = v
def is_video(self) -> bool:
return self.mimetype.startswith("video")
return self._mimetype.startswith("video")

Wyświetl plik

@ -98,7 +98,6 @@ class Metadata:
def add_media(self, media: Media) -> Metadata:
if media is None: return
media.set_mimetype()
return self.media.append(media)
def get_media_by_id(self, id:str) -> Media:
@ -110,7 +109,6 @@ class Metadata:
if final:
if self.final_media:
logger.warning(f"overwriting final media value :{self.final_media} with {final}")
final.set_mimetype()
self.final_media = final
return self

Wyświetl plik

@ -52,6 +52,7 @@ Cisticola considerations:
2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
"""
class ArchivingOrchestrator:
def __init__(self, config) -> None:
# in config.py we should test that the archivers exist and log mismatches (blocking execution)
@ -65,8 +66,8 @@ class ArchivingOrchestrator:
# Archiver.init(a, config)
# for a in config.archivers
# ]
self.feeder : Feeder = config.feeder
self.formatter : Formatter = config.formatter
self.feeder: Feeder = config.feeder
self.formatter: Formatter = config.formatter
self.enrichers = config.enrichers
self.archivers: List[Archiverv2] = config.archivers
self.databases: List[Database] = config.databases
@ -173,11 +174,9 @@ class ArchivingOrchestrator:
e.enrich(result)
# store media
unstored_media = result.media[::]
result.media = []
for s in self.storages:
for m in unstored_media:
result.media.append(s.store(m, result))
for m in result.media:
s.store(m, result) # modifies media
# formatters, enrichers, and storages will sometimes look for specific properties: eg <li>Screenshot: <img src="{res.get("screenshot")}"> </li>
# TODO: should there only be 1 formatter?
@ -186,7 +185,8 @@ class ArchivingOrchestrator:
# final format and store it
if (final_media := self.formatter.format(result)):
for s in self.storages:
result.set_final_media(s.store(final_media, result))
s.store(final_media, result)
result.set_final_media(final_media)
# signal completion to databases (DBs, Google Sheets, CSV, ...)
# a hash registration service could be one database: forensic archiving

Wyświetl plik

@ -5,4 +5,5 @@ from .s3_storage import S3Config, S3Storage
from .gd_storage import GDConfig, GDStorage
from .storage import StorageV2
from .s3 import S3StorageV2
from .s3 import S3StorageV2
from .local import LocalStorageV2

Wyświetl plik

@ -0,0 +1,46 @@
import shutil
from typing import IO, Any
import boto3, uuid, os, mimetypes
from botocore.errorfactory import ClientError
from metadata import Metadata
from media import Media
from storages import StorageV2
from loguru import logger
from slugify import slugify
class LocalStorageV2(StorageV2):
    """Storage implementation that archives media to the local filesystem.

    Files are copied (with metadata, via shutil.copy2) from their temporary
    location into the `save_to` folder. With `flatten` enabled all files are
    written directly under `save_to` using a slugified version of their key;
    otherwise the key's subdirectory structure is preserved.
    """
    name = "local_storage"

    def __init__(self, config: dict) -> None:
        super().__init__(config)
        # make sure the destination root exists up-front
        os.makedirs(self.save_to, exist_ok=True)

    @staticmethod
    def configs() -> dict:
        return {
            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
            "flatten": {"default": True, "help": "if true saves all files to the root of 'save_to' directory, if false preserves subdir structure"},
            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (leaks the file structure)"},
        }

    def get_cdn_url(self, media: Media) -> str:
        """Return the path where this media is (or will be) stored.

        Uses the same destination logic as upload() so the reported path
        matches the actual file location, including when `flatten` is on.
        """
        dest = self._destination_path(media)
        if self.save_absolute:
            dest = os.path.abspath(dest)
        return dest

    def _destination_path(self, media: Media) -> str:
        # single source of truth for where a media's key maps on disk,
        # shared by get_cdn_url() and upload() so they never disagree
        if self.flatten:
            # collapse any subdirectories in the key into one safe filename
            return os.path.join(self.save_to, slugify(media.key))
        return os.path.join(self.save_to, media.key)

    def upload(self, media: Media, **kwargs) -> bool:
        # override parent so that we can use shutil.copy2 and keep metadata
        dest = self._destination_path(media)
        # create the PARENT directory, not `dest` itself: making `dest` a
        # directory would cause copy2 to drop the file inside it under the
        # source basename, breaking the path advertised by get_cdn_url()
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
        shutil.copy2(media.filename, dest)
        return True

    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass

Wyświetl plik

@ -45,26 +45,27 @@ class S3StorageV2(StorageV2):
def get_cdn_url(self, media: Media) -> str:
return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any:
def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> None:
extra_args = kwargs.get("extra_args", {})
if not self.private and 'ACL' not in extra_args:
extra_args['ACL'] = 'public-read'
if 'ContentType' not in extra_args:
try:
extra_args['ContentType'] = mimetypes.guess_type(media.key)[0]
extra_args['ContentType'] = media.mimetype
except Exception as e:
logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
return True
def exists(self, key: str) -> bool:
"""
Tests if a given file with key=key exists in the bucket
"""
try:
self.s3.head_object(Bucket=self.bucket, Key=key)
return True
except ClientError as e:
logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
return False
# def exists(self, key: str) -> bool:
# """
# Tests if a given file with key=key exists in the bucket
# """
# try:
# self.s3.head_object(Bucket=self.bucket, Key=key)
# return True
# except ClientError as e:
# logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
# return False

Wyświetl plik

@ -7,6 +7,7 @@ from metadata import Metadata
from steps.step import Step
from loguru import logger
import os, uuid
from slugify import slugify
@dataclass
@ -21,23 +22,26 @@ class StorageV2(Step):
def init(name: str, config: dict) -> StorageV2:
return Step.init(name, config, StorageV2)
def store(self, media: Media, item: Metadata) -> Media:
media = self.set_key(media, item)
def store(self, media: Media, item: Metadata) -> None:
self.set_key(media, item)
self.upload(media)
media.cdn_url = self.get_cdn_url(media)
return media
media.add_url(self.get_cdn_url(media))
@abstractmethod
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass
def get_cdn_url(self, media: Media) -> str: pass
def upload(self, media: Media, **kwargs) -> Any:
logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}')
@abstractmethod
def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
def upload(self, media: Media, **kwargs) -> bool:
logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
with open(media.filename, 'rb') as f:
return self.uploadf(f, media, **kwargs)
def set_key(self, media: Media, item: Metadata) -> Media:
def set_key(self, media: Media, item: Metadata) -> None:
"""takes the media and optionally item info and generates a key"""
if media.key is not None and len(media.key) > 0: return
folder = item.get("folder", "")
ext = os.path.splitext(media.filename)[1]
media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
return media
# media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")