From e4603a942305bcd9ad772d14e21af7cb896ba759 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 22 Feb 2022 16:03:35 +0100
Subject: [PATCH] refactoring storage and bringing changes from origin

---
 __init__.py                     |  1 +
 archivers/base_archiver.py      | 55 +++++++++----------------
 archivers/telegram_archiver.py  | 27 ++++--------
 archivers/tiktok_archiver.py    | 39 +++++++-----------
 archivers/wayback_archiver.py   | 27 ++++++------
 archivers/youtubedl_archiver.py | 41 +++++++++---------
 auto_archive.py                 | 73 ++++++++++++++++++++-------------
 storages/__init__.py            |  3 ++
 storages/base_storage.py        | 19 +++++++++
 storages/s3_storage.py          | 49 ++++++++++++++++++++++
 10 files changed, 197 insertions(+), 137 deletions(-)
 create mode 100644 __init__.py
 create mode 100644 storages/__init__.py
 create mode 100644 storages/base_storage.py
 create mode 100644 storages/s3_storage.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..b85e02a
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+from storages import *
\ No newline at end of file
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 3f9f4ac..b13a77f 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -1,17 +1,10 @@
 import os
 import ffmpeg
-from dataclasses import dataclass
 import datetime
-from loguru import logger
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
 
-# TODO There should be a better way of generating keys, that adds the following info:
-# - name of sheet that it is being archived from
-#   (this means we might archive the same media twice on different sheets, but that's OK I think)
-# - name of archiver/platform that the video comes from
-# This should make it easier to maintain and clean the archive later
-
-# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
-# cleaned up? Difficult is we don't know the filename until the archivers start working.
+from storages import Storage
 
 
 @dataclass
@@ -25,33 +18,27 @@ class ArchiveResult:
     timestamp: datetime.datetime = None
 
 
-class Archiver:
+class Archiver(ABC):
     name = "default"
 
-    def __init__(self, s3_client):
-        self.s3 = s3_client
+    def __init__(self, storage: Storage):
+        self.storage = storage
 
     def __str__(self):
         return self.__class__.__name__
 
-    def download(self, url, check_if_exists=False):
-        logger.error("method 'download' not implemented")
-
-    def get_cdn_url(self, key):
-        return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
-            os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
-
-    def do_s3_upload(self, f, key):
-        self.s3.upload_fileobj(f, Bucket=os.getenv(
-            'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
+    @abstractmethod
+    def download(self, url, check_if_exists=False): pass
 
     def get_key(self, filename):
-        print(f"key base implementation: {self.name}")
-        # TODO: refactor to be more manageable
-        key = filename.split('/')[1]
-        if 'unknown_video' in key:
-            key = key.replace('unknown_video', 'jpg')
-        return key
+        """
+        returns a key in the format "[archiverName]_[filename]", extension included
+        """
+        tail = os.path.split(filename)[1]  # returns filename.ext from full path
+        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
+        if 'unknown_video' in _id:
+            _id = _id.replace('unknown_video', 'jpg')
+        return f'{self.name}_{_id}{extension}'
 
     def get_thumbnails(self, filename, duration=None):
         if not os.path.exists(filename.split('.')[0]):
@@ -80,10 +67,9 @@ class Archiver:
             thumbnail_filename = filename.split('.')[0] + '/' + fname
             key = filename.split('/')[1].split('.')[0] + '/' + fname
 
-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)
 
-            with open(thumbnail_filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(thumbnail_filename, key)
 
             cdn_urls.append(cdn_url)
             os.remove(thumbnail_filename)
@@ -107,9 +93,8 @@ class Archiver:
 
         thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
 
-        self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
-            'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
+        self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
 
-        thumb_index_cdn_url = self.get_cdn_url(thumb_index)
+        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
 
         return (key_thumb, thumb_index_cdn_url)
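
For reference, the naming scheme introduced by the new Archiver.get_key can be reproduced standalone, which makes its output easy to check. A minimal sketch (illustration only, not part of the patch; the example paths and names are made up):

    import os

    def get_key(archiver_name, filename):
        # mirrors Archiver.get_key: "[archiverName]_[filename]", extension kept
        tail = os.path.split(filename)[1]        # 'tmp/abc123.mp4' -> 'abc123.mp4'
        _id, extension = os.path.splitext(tail)  # -> ('abc123', '.mp4')
        if 'unknown_video' in _id:
            _id = _id.replace('unknown_video', 'jpg')
        return f'{archiver_name}_{_id}{extension}'

    print(get_key('telegram', 'tmp/abc123.mp4'))  # -> telegram_abc123.mp4
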
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index d9168e4..16c6ccf 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -1,15 +1,13 @@
 import os
 import requests
 from bs4 import BeautifulSoup
-from botocore.errorfactory import ClientError
-from .base_archiver import Archiver, ArchiveResult
-# TODO: get_cdn_url, get_thumbnails, do_s3_upload
+from .base_archiver import Archiver, ArchiveResult
 
 
 class TelegramArchiver(Archiver):
     name = "telegram"
-
+
     def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         if 'http://t.me/' not in url and 'https://t.me/' not in url:
@@ -35,19 +33,13 @@ class TelegramArchiver(Archiver):
 
         video_url = video.get('src')
         key = video_url.split('/')[-1].split('?')[0]
+        key = self.get_key(key)
+
         filename = 'tmp/' + key
 
-        if check_if_exists:
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-
-                status = 'already archived'
-
-            except ClientError:
-                pass
+        if check_if_exists and self.storage.exists(key):
+            status = 'already archived'
+            cdn_url = self.storage.get_cdn_url(key)
 
         v = requests.get(video_url, headers=headers)
 
@@ -55,10 +47,9 @@ class TelegramArchiver(Archiver):
             f.write(v.content)
 
         if status != 'already archived':
-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)
 
-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(filename, key)
 
         # extract duration from HTML
         duration = s.find_all('time')[0].contents[0]
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 1e3bcaf..e61fec9 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -1,15 +1,13 @@
 import os, traceback
-from botocore.errorfactory import ClientError
 import tiktok_downloader
 from loguru import logger
-from .base_archiver import Archiver, ArchiveResult
-# TODO: get_cdn_url, do_s3_upload, get_thumbnails
+from .base_archiver import Archiver, ArchiveResult
 
 
 class TiktokArchiver(Archiver):
     name = "tiktok"
-
+
     def download(self, url, check_if_exists=False):
         if 'tiktok.com' not in url:
             return False
@@ -18,35 +16,28 @@
 
         try:
             info = tiktok_downloader.info_post(url)
-            key = 'tiktok_' + str(info.id) + '.mp4'
+            key = self.get_key(f'{info.id}.mp4')
+            cdn_url = self.storage.get_cdn_url(key)
             filename = 'tmp/' + key
 
-            if check_if_exists:
-                try:
-                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+            if check_if_exists and self.storage.exists(key):
+                status = 'already archived'
 
-                    # file exists
-                    cdn_url = self.get_cdn_url(key)
+            media = tiktok_downloader.snaptik(url).get_media()
 
-                    status = 'already archived'
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
+                else:
+                    return ArchiveResult(status='Could not download media')
 
-                except ClientError:
-                    pass
+            media[0].download(filename)
 
             if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        self.do_s3_upload(f, key)
-
-                    cdn_url = self.get_cdn_url(key)
-                else:
-                    status = 'could not download media'
+                self.storage.upload(filename, key)
 
             try:
-                key_thumb, thumb_index = self.get_thumbnails(
-                    filename, duration=info.duration)
+                key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
             except:
                 key_thumb = ''
                 thumb_index = 'error creating thumbnails'
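
With the repeated head_object/ClientError boilerplate gone, the archivers above now talk to storage only through exists(), get_cdn_url() and upload(), which also makes the download flow easy to exercise against a stub. A minimal sketch, assuming the Storage ABC added in storages/base_storage.py below (FakeStorage is a hypothetical test helper, not part of the patch):

    from storages import Storage

    class FakeStorage(Storage):
        # in-memory stand-in for S3Storage, handy when exercising archivers in tests
        def __init__(self, config=None):
            self.files = {}

        def get_cdn_url(self, path):
            return f'fake://cdn/{path}'

        def exists(self, path):
            return path in self.files

        def uploadf(self, file, key, **kwargs):
            self.files[key] = file.read()
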
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index a021324..53b356f 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -1,14 +1,15 @@
 import time, requests, os
 from bs4 import BeautifulSoup
 
+from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
 
 
 class WaybackArchiver(Archiver):
     name = "wayback"
-
-    def __init__(self, s3_client):
-        self.s3 = s3_client
+
+    def __init__(self, storage: Storage):
+        super(WaybackArchiver, self).__init__(storage)
         self.seen_urls = {}
 
     def download(self, url, check_if_exists=False):
@@ -26,10 +27,12 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")
 
+        if 'job_id' not in r.json() and 'message' in r.json():
+            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
+
         job_id = r.json()['job_id']
 
-        status_r = requests.get(
-            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+        status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)
 
         retries = 0
 
@@ -51,7 +54,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()
 
         if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
 
         archive_url = 'https://web.archive.org/web/' + \
             status_json['timestamp'] + '/' + status_json['original_url']
@@ -59,15 +62,15 @@ class WaybackArchiver(Archiver):
 
         try:
             r = requests.get(archive_url)
 
-            parsed = BeautifulSoup(
-                r.content, 'html.parser')
+            parsed = BeautifulSoup(r.content, 'html.parser')
 
-            title = parsed.find_all('title')[
-                0].text
+            title = parsed.find_all('title')[0].text
+
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
         except:
             title = "Could not get title"
 
-        result = ArchiveResult(
-            status='Internet Archive fallback', cdn_url=archive_url, title=title)
+        result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
         self.seen_urls[url] = result
         return result
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 8249cfa..88f7970 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -3,12 +3,13 @@ import os
 import datetime
 import youtube_dl
 from loguru import logger
-from botocore.errorfactory import ClientError
+
 from .base_archiver import Archiver, ArchiveResult
 
+
 class YoutubeDLArchiver(Archiver):
     name = "yotube_dl"
-
+
     def download(self, url, check_if_exists=False):
         ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
         if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
@@ -32,8 +33,11 @@ class YoutubeDLArchiver(Archiver):
         if check_if_exists:
             if 'entries' in info:
                 if len(info['entries']) > 1:
+                    logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                    return False
+                elif len(info['entries']) == 0:
                     logger.warning(
-                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                        'YoutubeDLArchiver succeeded but did not find video')
                     return False
 
                 filename = ydl.prepare_filename(info['entries'][0])
@@ -42,20 +46,14 @@ class YoutubeDLArchiver(Archiver):
 
             key = self.get_key(filename)
 
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-
+            if self.storage.exists(key):
                 status = 'already archived'
-
-            except ClientError:
-                pass
+                cdn_url = self.storage.get_cdn_url(key)
 
         # sometimes this results in a different filename, so do this again
         info = ydl.extract_info(url, download=True)
 
+        # TODO: add support for multiple videos
         if 'entries' in info:
             if len(info['entries']) > 1:
                 logger.warning(
@@ -70,19 +68,24 @@ class YoutubeDLArchiver(Archiver):
             filename = filename.split('.')[0] + '.mkv'
 
         if status != 'already archived':
-            key = self.get_key(filename)
-            cdn_url = self.get_cdn_url(key)
+            key = self.get_key(filename)
+            cdn_url = self.storage.get_cdn_url(key)
 
-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(filename, key)
 
         # get duration
         duration = info['duration'] if 'duration' in info else None
 
         # get thumbnails
-        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        try:
+            key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        except:
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'
+
         os.remove(filename)
 
+        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
+
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp)
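
The timestamp expression introduced above packs three fallbacks into one line; unpacked, the logic reads as follows (equivalent sketch for readability, not part of the diff):

    import datetime

    def extract_timestamp(info):
        # 1) trust youtube_dl's explicit timestamp when present
        if 'timestamp' in info:
            return info['timestamp']
        # 2) otherwise derive one from upload_date ('YYYYMMDD'), if set
        if info.get('upload_date') is not None:
            return datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()
        # 3) otherwise leave the timestamp empty
        return None
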
diff --git a/auto_archive.py b/auto_archive.py
index c478463..36bbadb 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -2,12 +2,13 @@ import os
 import datetime
 import argparse
 import math
+import requests
 import gspread
-import boto3
 from loguru import logger
 from dotenv import load_dotenv
 
 import archivers
+from storages import S3Storage, S3Config
 
 load_dotenv()
 
@@ -41,7 +42,7 @@ def index_to_col(index):
     return alphabet[index]
 
 
-def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
+def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
     update = []
 
     if columns['status'] is not None:
@@ -103,19 +104,24 @@
 def process_sheet(sheet):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)
-    n_worksheets = len(sh.worksheets())
 
-    s3_client = boto3.client('s3',
-                             region_name=os.getenv('DO_SPACES_REGION'),
-                             endpoint_url='https://{}.digitaloceanspaces.com'.format(
-                                 os.getenv('DO_SPACES_REGION')),
-                             aws_access_key_id=os.getenv('DO_SPACES_KEY'),
-                             aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
+    s3_config = S3Config(
+        bucket=os.getenv('DO_BUCKET'),
+        region=os.getenv('DO_SPACES_REGION'),
+        key=os.getenv('DO_SPACES_KEY'),
+        secret=os.getenv('DO_SPACES_SECRET')
+    )
+
+    # s3_client = boto3.client('s3',
+    #                          region_name=os.getenv('DO_SPACES_REGION'),
+    #                          endpoint_url='https://{}.digitaloceanspaces.com'.format(
+    #                              os.getenv('DO_SPACES_REGION')),
+    #                          aws_access_key_id=os.getenv('DO_SPACES_KEY'),
+    #                          aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
 
     # loop through worksheets to check
-    for ii in range(n_worksheets):
-        logger.info("Opening worksheet " + str(ii))
-        wks = sh.get_worksheet(ii)
+    for ii, wks in enumerate(sh.worksheets()):
+        logger.info(f'Opening worksheet {ii}: "{wks.title}"')
         values = wks.get_all_values()
 
         headers = [v.lower() for v in values[0]]
@@ -126,7 +132,7 @@
             'source url')) if 'source url' in headers else None
 
         if columns['url'] is None:
-            logger.warning("No 'Media URL' column found, skipping")
+            logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
             continue
 
         url_index = col_to_index(columns['url'])
@@ -153,6 +159,9 @@
         columns['duration'] = index_to_col(headers.index(
             'duration')) if 'duration' in headers else None
 
+        # archives will be in a folder 'doc_name/worksheet_name'
+        s3_config.folder = f'{sheet}/{wks.title}/'
+        s3_client = S3Storage(s3_config)
 
         # order matters, first to succeed excludes remaining
         active_archivers = [
@@ -162,37 +171,43 @@
             archivers.WaybackArchiver(s3_client)
         ]
 
-        # loop through rows in worksheet
-        for i in range(2, len(values)+1):
-            v = values[i-1]
+        for i in range(2, len(values) + 1):
+            v = values[i - 1]
+            url = v[url_index]
 
-            if v[url_index] != "" and v[col_to_index(columns['status'])] == "":
-                latest_val = wks.acell(
-                    columns['status'] + str(i)).value
+            if url != "" and v[col_to_index(columns['status'])] == "":
+                latest_val = wks.acell(columns['status'] + str(i)).value
 
                 # check so we don't step on each others' toes
                 if latest_val == '' or latest_val is None:
-                    wks.update(
-                        columns['status'] + str(i), 'Archive in progress')
+                    wks.update(columns['status'] + str(i), 'Archive in progress')
+
+                    # expand short URL links
+                    if 'https://t.co/' in url:
+                        r = requests.get(url)
+                        url = r.url
 
                     for archiver in active_archivers:
                         logger.debug(f"Trying {archiver} on row {i}")
-                        result = archiver.download(v[url_index], check_if_exists=True)
+
+                        result = archiver.download(url, check_if_exists=True)
+
                         if result:
-                            logger.info(f"{archiver} succeeded on row {i}")
+                            logger.success(f"{archiver} succeeded on row {i}")
                             break
 
                     if result:
                         update_sheet(wks, i, result, columns, v)
+                    else:
+                        wks.update(columns['status'] + str(i), 'failed: no archiver')
 
-                # except:
-                # if any unexpected errors occured, log these into the Google Sheet
-                # t, value, traceback = sys.exc_info()
-
-                # update_sheet(wks, i, str(
-                #     value), {}, columns, v)
+                    # except:
+                    # if any unexpected errors occurred, log these into the Google Sheet
+                    # t, value, traceback = sys.exc_info()
+
+                    # update_sheet(wks, i, str(
+                    #     value), {}, columns, v)
 
 
 def main():
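
On the new short-link handling in auto_archive.py: requests follows redirects by default, so r.url already holds the final destination after the GET. A one-line illustration (the t.co link is a placeholder):

    import requests

    r = requests.get('https://t.co/example')  # hypothetical short link
    print(r.url)  # final URL after redirects
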
diff --git a/storages/__init__.py b/storages/__init__.py
new file mode 100644
index 0000000..3054d36
--- /dev/null
+++ b/storages/__init__.py
@@ -0,0 +1,3 @@
+# we need to explicitly expose the available imports here
+from .base_storage import *
+from .s3_storage import *
\ No newline at end of file
diff --git a/storages/base_storage.py b/storages/base_storage.py
new file mode 100644
index 0000000..050a8eb
--- /dev/null
+++ b/storages/base_storage.py
@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+
+
+class Storage(ABC):
+    @abstractmethod
+    def __init__(self, config): pass
+
+    @abstractmethod
+    def get_cdn_url(self, path): pass
+
+    @abstractmethod
+    def exists(self, path): pass
+
+    @abstractmethod
+    def uploadf(self, file, key, **kwargs): pass
+
+    def upload(self, filename: str, key: str, **kwargs):
+        with open(filename, 'rb') as f:
+            self.uploadf(f, key, **kwargs)
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
new file mode 100644
index 0000000..188db7e
--- /dev/null
+++ b/storages/s3_storage.py
@@ -0,0 +1,49 @@
+import boto3
+from botocore.errorfactory import ClientError
+from .base_storage import Storage
+from dataclasses import dataclass
+
+
+@dataclass
+class S3Config:
+    bucket: str
+    region: str
+    key: str
+    secret: str
+    folder: str = ""
+
+
+class S3Storage(Storage):
+
+    def __init__(self, config: S3Config):
+        self.bucket = config.bucket
+        self.region = config.region
+        self.folder = config.folder
+
+        if len(self.folder) and self.folder[-1] != '/':
+            self.folder += '/'
+
+        self.s3 = boto3.client(
+            's3',
+            region_name=self.region,
+            endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
+            aws_access_key_id=config.key,
+            aws_secret_access_key=config.secret
+        )
+
+    def _get_path(self, key):
+        return self.folder + key
+
+    def get_cdn_url(self, key):
+        return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
+
+    def exists(self, key):
+        try:
+            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+            return True
+        except ClientError:
+            return False
+
+    def uploadf(self, file, key, **kwargs):
+        extra_args = kwargs["extra_args"] if "extra_args" in kwargs else {'ACL': 'public-read'}
+        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
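
End to end, the new pieces wire together roughly like this (usage sketch; the bucket, region, credentials and key names are placeholders):

    from storages import S3Config, S3Storage

    config = S3Config(
        bucket='my-archive-bucket',
        region='fra1',
        key='SPACES_KEY',
        secret='SPACES_SECRET',
        folder='my-sheet/Sheet1/'  # per-worksheet folder, as set in auto_archive.py
    )
    storage = S3Storage(config)

    key = 'youtube_dl_abc123.mp4'
    if not storage.exists(key):
        storage.upload('tmp/abc123.mp4', key)
    print(storage.get_cdn_url(key))
    # https://my-archive-bucket.fra1.cdn.digitaloceanspaces.com/my-sheet/Sheet1/youtube_dl_abc123.mp4
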