refactoring storage and bringing in changes from origin

pull/13/head
msramalho 2022-02-22 16:03:35 +01:00
parent f3ce226665
commit e4603a9423
10 changed files with 197 additions and 137 deletions

__init__.py 100644
View file

@@ -0,0 +1 @@
from storages import *

View file

@@ -1,17 +1,10 @@
import os
import ffmpeg
from dataclasses import dataclass
import datetime
from loguru import logger
from dataclasses import dataclass
from abc import ABC, abstractmethod
# TODO There should be a better way of generating keys, that adds the following info:
# - name of sheet that it is being archived from
# (this means we might archive the same media twice on different sheets, but that's OK I think)
# - name of archiver/platform that the video comes from
# This should make it easier to maintain and clean the archive later
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
# cleaned up? Difficult is we don't know the filename until the archivers start working.
from storages import Storage
@dataclass
@@ -25,33 +18,27 @@ class ArchiveResult:
timestamp: datetime.datetime = None
class Archiver:
class Archiver(ABC):
name = "default"
def __init__(self, s3_client):
self.s3 = s3_client
def __init__(self, storage: Storage):
self.storage = storage
def __str__(self):
return self.__class__.__name__
def download(self, url, check_if_exists=False):
logger.error("method 'download' not implemented")
def get_cdn_url(self, key):
return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
def do_s3_upload(self, f, key):
self.s3.upload_fileobj(f, Bucket=os.getenv(
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
@abstractmethod
def download(self, url, check_if_exists=False): pass
def get_key(self, filename):
print(f"key base implementation: {self.name}")
# TODO: refactor to be more manageable
key = filename.split('/')[1]
if 'unknown_video' in key:
key = key.replace('unknown_video', 'jpg')
return key
"""
returns a key in the format "[archiverName]_[filename]", including the extension
"""
tail = os.path.split(filename)[1] # returns filename.ext from full path
_id, extension = os.path.splitext(tail) # returns [filename, .ext]
if 'unknown_video' in _id:
_id = _id.replace('unknown_video', 'jpg')
return f'{self.name}_{_id}{extension}'
def get_thumbnails(self, filename, duration=None):
if not os.path.exists(filename.split('.')[0]):
@@ -80,10 +67,9 @@ class Archiver:
thumbnail_filename = filename.split('.')[0] + '/' + fname
key = filename.split('/')[1].split('.')[0] + '/' + fname
cdn_url = self.get_cdn_url(key)
cdn_url = self.storage.get_cdn_url(key)
with open(thumbnail_filename, 'rb') as f:
self.do_s3_upload(f, key)
self.storage.upload(thumbnail_filename, key)
cdn_urls.append(cdn_url)
os.remove(thumbnail_filename)
@@ -107,9 +93,8 @@ class Archiver:
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
thumb_index_cdn_url = self.get_cdn_url(thumb_index)
thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
return (key_thumb, thumb_index_cdn_url)
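A hypothetical minimal archiver (not part of this commit; the name, URL handling and filenames are placeholders) illustrating the effect of this refactor: subclasses now receive a Storage instance instead of a raw boto3 client, and go through self.storage for existence checks, uploads and CDN URLs.

import requests
from .base_archiver import Archiver, ArchiveResult

class ExampleArchiver(Archiver):
    name = "example"  # get_key prefixes keys with this, e.g. "example_some_id.mp4"

    def download(self, url, check_if_exists=False):
        key = self.get_key('some_id.mp4')  # illustrative filename
        if check_if_exists and self.storage.exists(key):
            # file is already in the bucket, just report its CDN URL
            return ArchiveResult(status='already archived', cdn_url=self.storage.get_cdn_url(key))
        filename = 'tmp/' + key
        with open(filename, 'wb') as f:
            f.write(requests.get(url).content)
        self.storage.upload(filename, key)
        return ArchiveResult(status='success', cdn_url=self.storage.get_cdn_url(key))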

View file

@@ -1,15 +1,13 @@
import os
import requests
from bs4 import BeautifulSoup
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, get_thumbnails, do_s3_upload
from .base_archiver import Archiver, ArchiveResult
class TelegramArchiver(Archiver):
name = "telegram"
def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle
if 'http://t.me/' not in url and 'https://t.me/' not in url:
@@ -35,19 +33,13 @@ class TelegramArchiver(Archiver):
video_url = video.get('src')
key = video_url.split('/')[-1].split('?')[0]
key = self.get_key(key)
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
if check_if_exists and self.storage.exists(key):
status = 'already archived'
cdn_url = self.storage.get_cdn_url(key)
v = requests.get(video_url, headers=headers)
@@ -55,10 +47,9 @@ class TelegramArchiver(Archiver):
f.write(v.content)
if status != 'already archived':
cdn_url = self.get_cdn_url(key)
cdn_url = self.storage.get_cdn_url(key)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
self.storage.upload(filename, key)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]

View file

@@ -1,15 +1,13 @@
import os, traceback
from botocore.errorfactory import ClientError
import tiktok_downloader
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, do_s3_upload, get_thumbnails
from .base_archiver import Archiver, ArchiveResult
class TiktokArchiver(Archiver):
name = "tiktok"
def download(self, url, check_if_exists=False):
if 'tiktok.com' not in url:
return False
@@ -18,35 +16,28 @@ class TiktokArchiver(Archiver):
try:
info = tiktok_downloader.info_post(url)
key = 'tiktok_' + str(info.id) + '.mp4'
key = self.get_key(f'{info.id}.mp4')
cdn_url = self.get_cdn_url(key)
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
if check_if_exists and self.storage.exists(key):
status = 'already archived'
# file exists
cdn_url = self.get_cdn_url(key)
media = tiktok_downloader.snaptik(url).get_media()
status = 'already archived'
if len(media) <= 0:
if status == 'already archived':
return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
else:
return ArchiveResult(status='Could not download media')
except ClientError:
pass
media[0].download(filename)
if status != 'already archived':
media = tiktok_downloader.snaptik(url).get_media()
if len(media) > 0:
media[0].download(filename)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
cdn_url = self.get_cdn_url(key)
else:
status = 'could not download media'
self.storage.upload(filename, key)
try:
key_thumb, thumb_index = self.get_thumbnails(
filename, duration=info.duration)
key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
except:
key_thumb = ''
thumb_index = 'error creating thumbnails'

View file

@@ -1,14 +1,15 @@
import time, requests, os
from bs4 import BeautifulSoup
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
class WaybackArchiver(Archiver):
name = "wayback"
def __init__(self, s3_client):
self.s3 = s3_client
def __init__(self, storage: Storage):
super(WaybackArchiver, self).__init__(storage)
self.seen_urls = {}
def download(self, url, check_if_exists=False):
@@ -26,10 +27,12 @@ class WaybackArchiver(Archiver):
if r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
if 'job_id' not in r.json() and 'message' in r.json():
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
job_id = r.json()['job_id']
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)
retries = 0
@@ -51,7 +54,7 @@ class WaybackArchiver(Archiver):
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
@@ -59,15 +62,15 @@ class WaybackArchiver(Archiver):
try:
r = requests.get(archive_url)
parsed = BeautifulSoup(
r.content, 'html.parser')
parsed = BeautifulSoup(r.content, 'html.parser')
title = parsed.find_all('title')[
0].text
title = parsed.find_all('title')[0].text
if title == 'Wayback Machine':
title = 'Could not get title'
except:
title = "Could not get title"
result = ArchiveResult(
status='Internet Archive fallback', cdn_url=archive_url, title=title)
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
self.seen_urls[url] = result
return result

View file

@@ -3,12 +3,13 @@ import os
import datetime
import youtube_dl
from loguru import logger
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver):
name = "yotube_dl"
def download(self, url, check_if_exists=False):
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'):
@@ -32,8 +33,11 @@ class YoutubeDLArchiver(Archiver):
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
return False
elif len(info['entries']) == 0:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
'YoutubeDLArchiver succeeded but did not find video')
return False
filename = ydl.prepare_filename(info['entries'][0])
@@ -42,20 +46,14 @@ class YoutubeDLArchiver(Archiver):
key = self.get_key(filename)
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
if self.storage.exists(key):
status = 'already archived'
except ClientError:
pass
cdn_url = self.storage.get_cdn_url(key)
# sometimes this results in a different filename, so do this again
info = ydl.extract_info(url, download=True)
# TODO: add support for multiple videos
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
@@ -70,19 +68,24 @@ class YoutubeDLArchiver(Archiver):
filename = filename.split('.')[0] + '.mkv'
if status != 'already archived':
key = self. get_key(filename)
cdn_url = self.get_cdn_url(key)
key = self.get_key(filename)
cdn_url = self.storage.get_cdn_url(key)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
self.storage.upload(filename, key)
# get duration
duration = info['duration'] if 'duration' in info else None
# get thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
try:
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
except:
key_thumb = ''
thumb_index = 'Could not generate thumbnails'
os.remove(filename)
timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None,
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
title=info['title'] if 'title' in info else None, timestamp=timestamp)
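The nested conditional expression that computes timestamp above is dense; an equivalent expanded form (purely illustrative, same logic as the one-liner in the diff; datetime is already imported at the top of the module):

if 'timestamp' in info:
    timestamp = info['timestamp']
elif 'upload_date' in info and info['upload_date'] is not None:
    timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()
else:
    timestamp = None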

View file

@@ -2,12 +2,13 @@ import os
import datetime
import argparse
import math
import requests
import gspread
import boto3
from loguru import logger
from dotenv import load_dotenv
import archivers
from storages import S3Storage, S3Config
load_dotenv()
@@ -41,7 +42,7 @@ def index_to_col(index):
return alphabet[index]
def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
update = []
if columns['status'] is not None:
@@ -103,19 +104,24 @@ def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
def process_sheet(sheet):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
n_worksheets = len(sh.worksheets())
s3_client = boto3.client('s3',
region_name=os.getenv('DO_SPACES_REGION'),
endpoint_url='https://{}.digitaloceanspaces.com'.format(
os.getenv('DO_SPACES_REGION')),
aws_access_key_id=os.getenv('DO_SPACES_KEY'),
aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
s3_config = S3Config(
bucket=os.getenv('DO_BUCKET'),
region=os.getenv('DO_SPACES_REGION'),
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET')
)
# s3_client = boto3.client('s3',
# region_name=os.getenv('DO_SPACES_REGION'),
# endpoint_url='https://{}.digitaloceanspaces.com'.format(
# os.getenv('DO_SPACES_REGION')),
# aws_access_key_id=os.getenv('DO_SPACES_KEY'),
# aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
# loop through worksheets to check
for ii in range(n_worksheets):
logger.info("Opening worksheet " + str(ii))
wks = sh.get_worksheet(ii)
for ii, wks in enumerate(sh.worksheets()):
logger.info(f'Opening worksheet {ii}: "{wks.title}"')
values = wks.get_all_values()
headers = [v.lower() for v in values[0]]
@@ -126,7 +132,7 @@ def process_sheet(sheet):
'source url')) if 'source url' in headers else None
if columns['url'] is None:
logger.warning("No 'Media URL' column found, skipping")
logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
continue
url_index = col_to_index(columns['url'])
@@ -153,6 +159,9 @@ def process_sheet(sheet):
columns['duration'] = index_to_col(headers.index(
'duration')) if 'duration' in headers else None
# archives will be in a folder 'doc_name/worksheet_name'
s3_config.folder = f'{sheet}/{wks.title}/'
s3_client = S3Storage(s3_config)
# order matters, first to succeed excludes remaining
active_archivers = [
@@ -162,37 +171,43 @@ def process_sheet(sheet):
archivers.WaybackArchiver(s3_client)
]
# loop through rows in worksheet
for i in range(2, len(values)+1):
v = values[i-1]
for i in range(2, len(values) + 1):
v = values[i - 1]
url = v[url_index]
if v[url_index] != "" and v[col_to_index(columns['status'])] == "":
latest_val = wks.acell(
columns['status'] + str(i)).value
if url != "" and v[col_to_index(columns['status'])] == "":
latest_val = wks.acell(columns['status'] + str(i)).value
# check so we don't step on each others' toes
if latest_val == '' or latest_val is None:
wks.update(
columns['status'] + str(i), 'Archive in progress')
wks.update(columns['status'] + str(i), 'Archive in progress')
# expand short URL links
if 'https://t.co/' in url:
r = requests.get(url)
url = r.url
for archiver in active_archivers:
logger.debug(f"Trying {archiver} on row {i}")
result = archiver.download(v[url_index], check_if_exists=True)
result = archiver.download(url, check_if_exists=True)
if result:
logger.info(f"{archiver} succeeded on row {i}")
logger.success(f"{archiver} succeeded on row {i}")
break
if result:
update_sheet(wks, i, result, columns, v)
else:
wks.update(columns['status'] + str(i), 'failed: no archiver')
# except:
# if any unexpected errors occurred, log these into the Google Sheet
# t, value, traceback = sys.exc_info()
# except:
# if any unexpected errors occurred, log these into the Google Sheet
# t, value, traceback = sys.exc_info()
# update_sheet(wks, i, str(
# value), {}, columns, v)
# update_sheet(wks, i, str(
# value), {}, columns, v)
def main():

View file

@@ -0,0 +1,3 @@
# we need to explicitly expose the available imports here
from .base_storage import *
from .s3_storage import *

View file

@@ -0,0 +1,19 @@
from abc import ABC, abstractmethod
class Storage(ABC):
@abstractmethod
def __init__(self, config): pass
@abstractmethod
def get_cdn_url(self, path): pass
@abstractmethod
def exists(self, path): pass
@abstractmethod
def uploadf(self, file, key, **kwargs): pass
def upload(self, filename: str, key: str, **kwargs):
with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs)
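As a hypothetical illustration (not in this commit) of how another backend could plug into this abstraction, a local-filesystem storage might look roughly like the sketch below; the class name and the assumption that config is a plain directory path are made up for the example.

import os
import shutil
from .base_storage import Storage

class LocalStorage(Storage):
    def __init__(self, config):
        self.root = config  # assumed: a base directory path

    def get_cdn_url(self, key):
        return os.path.join(self.root, key)  # no CDN locally, return the path itself

    def exists(self, key):
        return os.path.exists(os.path.join(self.root, key))

    def uploadf(self, file, key, **kwargs):
        dest = os.path.join(self.root, key)
        os.makedirs(os.path.dirname(dest) or '.', exist_ok=True)
        with open(dest, 'wb') as out:
            shutil.copyfileobj(file, out)  # upload() from the base class keeps working unchanged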

View file

@@ -0,0 +1,49 @@
import boto3
from botocore.errorfactory import ClientError
from .base_storage import Storage
from dataclasses import dataclass
@dataclass
class S3Config:
bucket: str
region: str
key: str
secret: str
folder: str = ""
class S3Storage(Storage):
def __init__(self, config: S3Config):
self.bucket = config.bucket
self.region = config.region
self.folder = config.folder
if len(self.folder) and self.folder[-1] != '/':
self.folder += '/'
self.s3 = boto3.client(
's3',
region_name=self.region,
endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
aws_access_key_id=config.key,
aws_secret_access_key=config.secret
)
def _get_path(self, key):
return self.folder + key
def get_cdn_url(self, key):
return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
def exists(self, key):
try:
self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
return True
except ClientError:
return False
def uploadf(self, file, key, **kwargs):
extra_args = kwargs["extra_args"] if "extra_args" in kwargs else {'ACL': 'public-read'}
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
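End-to-end usage sketch mirroring how auto_archive.py wires this up in the diff above; the environment variable names come from the diff, while the folder, key and local path are illustrative.

import os
from storages import S3Storage, S3Config

s3_config = S3Config(
    bucket=os.getenv('DO_BUCKET'),
    region=os.getenv('DO_SPACES_REGION'),
    key=os.getenv('DO_SPACES_KEY'),
    secret=os.getenv('DO_SPACES_SECRET'),
    folder='my_sheet/my_worksheet/',  # archives end up under "doc_name/worksheet_name"
)
storage = S3Storage(s3_config)

key = 'telegram_abc123.mp4'  # illustrative key, as produced by Archiver.get_key
if not storage.exists(key):
    storage.upload('tmp/' + key, key)  # opens the local file and delegates to uploadf
print(storage.get_cdn_url(key))
# e.g. https://<bucket>.<region>.cdn.digitaloceanspaces.com/my_sheet/my_worksheet/telegram_abc123.mp4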