From 6ca46417feeda7f6ac586214cbf40917f9d9b50f Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Thu, 12 Jan 2023 02:09:39 +0000
Subject: [PATCH] local storage + multiple storage support

---
 src/archivers/telethon_archiverv2.py        |  4 +-
 src/databases/gsheet_db.py                  |  4 +-
 src/enrichers/wayback_enricher.py           |  2 +-
 src/formatters/templates/html_template.html | 19 +++++----
 src/media.py                                | 28 +++++++++----
 src/metadata.py                             |  2 -
 src/orchestrator.py                         | 14 +++----
 src/storages/__init__.py                    |  3 +-
 src/storages/local.py                       | 46 +++++++++++++++++++++
 src/storages/s3.py                          | 25 +++++------
 src/storages/storage.py                     | 24 ++++++-----
 11 files changed, 117 insertions(+), 54 deletions(-)
 create mode 100644 src/storages/local.py

diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
index 66ecd74..6851cb5 100644
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@@ -132,11 +132,11 @@ class TelethonArchiver(Archiverv2):
         if mp.entities:
             other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image", "audio"]]
             if len(other_media_urls):
-                logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
+                logger.debug(f"Got {len(other_media_urls)} other media urls from {mp.id=}: {other_media_urls}")
                 for i, om_url in enumerate(other_media_urls):
                     filename = os.path.join(tmp_dir, f'{chat}_{group_id}_{i}')
                     self.download_from_url(om_url, filename)
-                    result.add_media(Media(filename))
+                    result.add_media(Media(filename=filename, id=f"{group_id}_{i}"))
 
         filename_dest = os.path.join(tmp_dir, f'{chat}_{group_id}', str(mp.id))
         filename = self.client.download_media(mp.media, filename_dest)

diff --git a/src/databases/gsheet_db.py b/src/databases/gsheet_db.py
index 26aae68..0cf65ed 100644
--- a/src/databases/gsheet_db.py
+++ b/src/databases/gsheet_db.py
@@ -63,13 +63,13 @@ class GsheetsDb(Database):
 
         media: Media = item.get_single_media()
 
-        batch_if_valid('archive', media.cdn_url)
+        batch_if_valid('archive', "\n".join(media.urls))
         batch_if_valid('date', True, datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat())
         batch_if_valid('title', item.get_title())
         batch_if_valid('text', item.get("content", "")[:500])
         batch_if_valid('timestamp', item.get_timestamp())
         if (screenshot := item.get_media_by_id("screenshot")):
-            batch_if_valid('screenshot', screenshot.cdn_url)
+            batch_if_valid('screenshot', "\n".join(screenshot.urls))
 
         # batch_if_valid('status', item.status)
         # TODO: AFTER ENRICHMENTS

diff --git a/src/enrichers/wayback_enricher.py b/src/enrichers/wayback_enricher.py
index 09a43e0..bf55923 100644
--- a/src/enrichers/wayback_enricher.py
+++ b/src/enrichers/wayback_enricher.py
@@ -21,7 +21,7 @@ class WaybackEnricher(Enricher):
     @staticmethod
    def configs() -> dict:
        return {
-            "timeout": {"default": 5, "help": "number of seconds to wait for a response from webarchive's wayback machine, after that only job_id is saved but page will still be processed."},
+            "timeout": {"default": 15, "help": "seconds to wait for a successful archive confirmation from wayback; if the timeout passes, the result contains only the job_id so the status can be checked manually later."},
             "key": {"default": None, "help": "wayback API key. to get credentials visit https://archive.org/account/s3.php"},
             "secret": {"default": None, "help": "wayback API secret. to get credentials visit https://archive.org/account/s3.php"}
         }

diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html
index fc986f0..f488a5f 100644
--- a/src/formatters/templates/html_template.html
+++ b/src/formatters/templates/html_template.html
@@ -26,6 +26,7 @@
         table, th, td {
+            margin: auto;
             border: 1px solid;
             border-collapse: collapse;
         }
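The gsheet_db.py hunk above means a single spreadsheet cell now references every stored copy of a file rather than one cdn_url. A minimal illustration of the resulting cell value (URLs hypothetical, not from the patch):

    # one line per storage that handled the media
    urls = ["https://cdn.example.com/abc.mp4", "./archived/abc.mp4"]
    cell_value = "\n".join(urls)
    # -> "https://cdn.example.com/abc.mp4\n./archived/abc.mp4"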

[The remaining html_template.html hunks (@@ -43,18 +44,17 @@, @@ -67,25 +67,28 @@ and @@ -100,7 +103,7 @@) lost their HTML markup in extraction and cannot be reconstructed verbatim. The recoverable changes: the media table's "preview" column header becomes "preview(s)"; each media row keeps its ARCHIVE link and conditional hash display ({% if m.hash | length > 1 %} hash: {{ m.hash }} {% endif %}); the preview cell now loops with {% for url in m.urls %}, rendering one image or video element per stored URL depending on whether 'image' or 'video' is in m.mimetype; and the footer "made with bellingcat/auto-archiver, add suggestions and report issues on the project's github page" is shortened to "Made with bellingcat/auto-archiver".]
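The template loop above works because of the media.py hunk below: Media.cdn_url (a single string) becomes a urls list with an add_url() helper, and mimetype becomes a lazily-guessed property. A minimal sketch of the resulting behaviour (values hypothetical, assuming the dataclass exactly as defined below and the repo's src layout on the import path):

    from media import Media

    m = Media(filename="tmp/video.mp4")
    m.add_url("https://cdn.example.com/abc.mp4")  # e.g. appended by an S3 storage
    m.add_url("./archived/abc.mp4")               # e.g. appended by a local storage
    print(m.mimetype)  # "video/mp4", guessed from the filename on first access
    print(m.urls)      # both stored locations, in storage order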
\ No newline at end of file

diff --git a/src/media.py b/src/media.py
index 3c416be..e50cc14 100644
--- a/src/media.py
+++ b/src/media.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 from ast import List
 from typing import Any, Union, Dict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import mimetypes
 
 
 @dataclass
@@ -10,15 +10,25 @@ import mimetypes
 class Media:
     filename: str
     key: str = None
-    cdn_url: str = None
-    mimetype: str = None  # eg: image/jpeg
-    id: str = None  # in case this type of media needs a special id, eg: screenshot
+    urls: List[str] = field(default_factory=list)
+    _mimetype: str = None  # eg: image/jpeg
+    id: str = ""  # in case this type of media needs a special id, eg: screenshot
     # hash: str = None # TODO: added by enrichers
 
-    def set_mimetype(self) -> Media:
-        if not self.mimetype:
-            self.mimetype = mimetypes.guess_type(self.filename)[0]
-        return self
+    def add_url(self, url: str) -> None:
+        # url can be remote, local, ...
+        self.urls.append(url)
+
+    @property  # getter .mimetype
+    def mimetype(self) -> str:
+        assert self.filename is not None and len(self.filename) > 0, "cannot get mimetype from media without filename"
+        if not self._mimetype:
+            self._mimetype = mimetypes.guess_type(self.filename)[0]
+        return self._mimetype
+
+    @mimetype.setter  # setter .mimetype
+    def mimetype(self, v: str) -> None:
+        self._mimetype = v
 
     def is_video(self) -> bool:
-        return self.mimetype.startswith("video")
+        return self._mimetype.startswith("video")

diff --git a/src/metadata.py b/src/metadata.py
index 7af923c..7f57c3b 100644
--- a/src/metadata.py
+++ b/src/metadata.py
@@ -98,7 +98,6 @@ class Metadata:
 
     def add_media(self, media: Media) -> Metadata:
         if media is None: return
-        media.set_mimetype()
         return self.media.append(media)
 
     def get_media_by_id(self, id:str) -> Media:
@@ -110,7 +109,6 @@ class Metadata:
 
         if final:
             if self.final_media:
                 logger.warning(f"overwriting final media value :{self.final_media} with {final}")
-            final.set_mimetype()
             self.final_media = final
         return self

diff --git a/src/orchestrator.py b/src/orchestrator.py
index 3d554e0..612ea2b 100644
--- a/src/orchestrator.py
+++ b/src/orchestrator.py
@@ -52,6 +52,7 @@ Cisticola considerations:
 2. So the auto-archiver becomes like a puzzle and fixes to Cisticola scrapers can immediately benefit it, and contributions are focused on a single source or scraping
 """
 
+
 class ArchivingOrchestrator:
     def __init__(self, config) -> None:
         # in config.py we should test that the archivers exist and log mismatches (blocking execution)
@@ -65,8 +66,8 @@ class ArchivingOrchestrator:
         #     Archiver.init(a, config)
         #     for a in config.archivers
         # ]
-        self.feeder : Feeder = config.feeder
-        self.formatter : Formatter = config.formatter
+        self.feeder: Feeder = config.feeder
+        self.formatter: Formatter = config.formatter
         self.enrichers = config.enrichers
         self.archivers: List[Archiverv2] = config.archivers
         self.databases: List[Database] = config.databases
@@ -173,11 +174,9 @@ class ArchivingOrchestrator:
             e.enrich(result)
 
         # store media
-        unstored_media = result.media[::]
-        result.media = []
         for s in self.storages:
-            for m in unstored_media:
-                result.media.append(s.store(m, result))
+            for m in result.media:
+                s.store(m, result)  # modifies media
         # formatters, enrichers, and storages will sometimes look for specific properties: eg
         # - Screenshot:
         # TODO: should there only be 1 formatter?
@@ -186,7 +185,8 @@ class ArchivingOrchestrator:
         # final format and store it
         if (final_media := self.formatter.format(result)):
             for s in self.storages:
-                result.set_final_media(s.store(final_media, result))
+                s.store(final_media, result)
+            result.set_final_media(final_media)
 
         # signal completion to databases (DBs, Google Sheets, CSV, ...)
         # a hash registration service could be one database: forensic archiving

diff --git a/src/storages/__init__.py b/src/storages/__init__.py
index 91ce148..4c0783c 100644
--- a/src/storages/__init__.py
+++ b/src/storages/__init__.py
@@ -5,4 +5,5 @@ from .s3_storage import S3Config, S3Storage
 from .gd_storage import GDConfig, GDStorage
 
 from .storage import StorageV2
-from .s3 import S3StorageV2
\ No newline at end of file
+from .s3 import S3StorageV2
+from .local import LocalStorageV2
\ No newline at end of file

diff --git a/src/storages/local.py b/src/storages/local.py
new file mode 100644
index 0000000..aafb28c
--- /dev/null
+++ b/src/storages/local.py
@@ -0,0 +1,46 @@
+
+import shutil
+from typing import IO, Any
+import boto3, uuid, os, mimetypes
+from botocore.errorfactory import ClientError
+from metadata import Metadata
+from media import Media
+from storages import StorageV2
+from loguru import logger
+from slugify import slugify
+
+
+class LocalStorageV2(StorageV2):
+    name = "local_storage"
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+        os.makedirs(self.save_to, exist_ok=True)
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "save_to": {"default": "./archived", "help": "folder where to save archived content"},
+            "flatten": {"default": True, "help": "if true saves all files to the root of the 'save_to' directory, if false preserves subdir structure"},
+            "save_absolute": {"default": False, "help": "whether the path to the stored file is absolute or relative (absolute paths leak the local file structure)"},
+        }
+
+    def get_cdn_url(self, media: Media) -> str:
+        dest = os.path.join(self.save_to, media.key)
+        if self.save_absolute:
+            dest = os.path.abspath(dest)
+        return dest
+
+    def upload(self, media: Media, **kwargs) -> bool:
+        # override parent so that we can use shutil.copy2 and keep file metadata
+        if self.flatten:
+            dest = os.path.join(self.save_to, slugify(media.key))
+        else:
+            dest = os.path.join(self.save_to, media.key)
+        os.makedirs(os.path.dirname(dest), exist_ok=True)  # create the parent folder, not the destination path itself
+        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key} to {dest}')
+        shutil.copy2(media.filename, dest)
+        return True
+
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass

diff --git a/src/storages/s3.py b/src/storages/s3.py
index d4457e8..acd907e 100644
--- a/src/storages/s3.py
+++ b/src/storages/s3.py
@@ -45,26 +45,27 @@ class S3StorageV2(StorageV2):
     def get_cdn_url(self, media: Media) -> str:
         return self.cdn_url.format(bucket=self.bucket, region=self.region, key=media.key)
 
-    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> Any:
+    def uploadf(self, file: IO[bytes], media: Media, **kwargs: dict) -> bool:
         extra_args = kwargs.get("extra_args", {})
         if not self.private and 'ACL' not in extra_args:
             extra_args['ACL'] = 'public-read'
         if 'ContentType' not in extra_args:
             try:
-                extra_args['ContentType'] = mimetypes.guess_type(media.key)[0]
+                extra_args['ContentType'] = media.mimetype
             except Exception as e:
                 logger.warning(f"Unable to get mimetype for {media.key=}, error: {e}")
         self.s3.upload_fileobj(file, Bucket=self.bucket, Key=media.key, ExtraArgs=extra_args)
+        return True
 
-    def exists(self, key: str) -> bool:
-        """
-        Tests if a given file with key=key exists in the bucket
-        """
-        try:
-            self.s3.head_object(Bucket=self.bucket, Key=key)
-            return True
-        except ClientError as e:
-            logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
-            return False
+    # def exists(self, key: str) -> bool:
+    #     """
+    #     Tests if a given file with key=key exists in the bucket
+    #     """
+    #     try:
+    #         self.s3.head_object(Bucket=self.bucket, Key=key)
+    #         return True
+    #     except ClientError as e:
+    #         logger.warning(f"got a ClientError when checking if {key=} exists in bucket={self.bucket}: {e}")
+    #         return False

diff --git a/src/storages/storage.py b/src/storages/storage.py
index 06346e9..61d4c77 100644
--- a/src/storages/storage.py
+++ b/src/storages/storage.py
@@ -7,6 +7,7 @@ from metadata import Metadata
 from steps.step import Step
 from loguru import logger
 import os, uuid
+from slugify import slugify
 
 
 @dataclass
@@ -21,23 +22,26 @@ class StorageV2(Step):
     def init(name: str, config: dict) -> StorageV2:
         return Step.init(name, config, StorageV2)
 
-    def store(self, media: Media, item: Metadata) -> Media:
-        media = self.set_key(media, item)
+    def store(self, media: Media, item: Metadata) -> None:
+        self.set_key(media, item)
         self.upload(media)
-        media.cdn_url = self.get_cdn_url(media)
-        return media
+        media.add_url(self.get_cdn_url(media))
 
     @abstractmethod
-    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> Any: pass
+    def get_cdn_url(self, media: Media) -> str: pass
 
-    def upload(self, media: Media, **kwargs) -> Any:
-        logger.debug(f'[{self.__class__.name}] uploading file {media.filename} with key {media.key}')
+    @abstractmethod
+    def uploadf(self, file: IO[bytes], key: str, **kwargs: dict) -> bool: pass
+
+    def upload(self, media: Media, **kwargs) -> bool:
+        logger.debug(f'[{self.__class__.name}] storing file {media.filename} with key {media.key}')
         with open(media.filename, 'rb') as f:
             return self.uploadf(f, media, **kwargs)
 
-    def set_key(self, media: Media, item: Metadata) -> Media:
+    def set_key(self, media: Media, item: Metadata) -> None:
         """takes the media and optionally item info and generates a key"""
+        if media.key is not None and len(media.key) > 0: return
         folder = item.get("folder", "")
         ext = os.path.splitext(media.filename)[1]
-        media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
-        return media
+        # media.key = os.path.join(folder, f"{str(uuid.uuid4())}{ext}")
+        media.key = os.path.join(folder, slugify(item.get_url()), f"{str(uuid.uuid4())}{ext}")
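Taken together, StorageV2.store() now mutates the Media in place: set_key() builds a key of the form <folder>/<slugified-url>/<uuid><ext> (skipping media that already have one), upload() writes the file, and the storage-specific location is appended via media.add_url(). A worked sketch of the new key scheme (folder, url and extension values hypothetical):

    import os, uuid
    from slugify import slugify

    folder, url, ext = "my-folder", "https://t.me/some_channel/123", ".mp4"
    key = os.path.join(folder, slugify(url), f"{str(uuid.uuid4())}{ext}")
    print(key)  # e.g. my-folder/https-t-me-some-channel-123/<random-uuid>.mp4

With several storages configured, the orchestrator loop calls store() once per storage, so each Media ends up with one entry in media.urls per storage.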