From 07b5d357b478b892313f8813e4fb77764fd811c9 Mon Sep 17 00:00:00 2001
From: Logan Williams <logan.williams@alum.mit.edu>
Date: Tue, 22 Feb 2022 08:20:45 +0100
Subject: [PATCH] Fix bugs in WaybackArchiver, follow redirects sometimes

---
 archivers.py    | 48 +++++++++++++++++++++++++++++++++++-------------
 auto_archive.py | 30 +++++++++++++++++++-----------
 2 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/archivers.py b/archivers.py
index d8a72f6..7c8df8c 100644
--- a/archivers.py
+++ b/archivers.py
@@ -210,7 +210,11 @@ class YoutubeDLArchiver(Archiver):
             if 'entries' in info:
                 if len(info['entries']) > 1:
                     logger.warning(
-                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                        'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                    return False
+                elif len(info['entries']) == 0:
+                    logger.warning(
+                        'YoutubeDLArchiver succeeded but did not find video')
                     return False
 
                 filename = ydl.prepare_filename(info['entries'][0])
@@ -257,13 +261,21 @@ class YoutubeDLArchiver(Archiver):
         duration = info['duration'] if 'duration' in info else None
 
         # get thumbnails
-        key_thumb, thumb_index = get_thumbnails(
-            filename, self.s3, duration=duration)
+        try:
+            key_thumb, thumb_index = get_thumbnails(
+                filename, self.s3, duration=duration)
+        except:
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'
+
         os.remove(filename)
 
+        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(
+            info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
+
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                              title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+                             timestamp=timestamp)
 
 
 class WaybackArchiver(Archiver):
@@ -286,6 +298,9 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")
 
+        if 'job_id' not in r.json() and 'message' in r.json():
+            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
+
         job_id = r.json()['job_id']
 
         status_r = requests.get(
@@ -311,7 +326,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()
 
         if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
 
         archive_url = 'https://web.archive.org/web/' + \
             status_json['timestamp'] + '/' + status_json['original_url']
@@ -324,6 +339,9 @@ class WaybackArchiver(Archiver):
 
             title = parsed.find_all('title')[
                 0].text
+
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
         except:
             title = "Could not get title"
 
@@ -343,6 +361,7 @@ class TiktokArchiver(Archiver):
         try:
             info = tiktok_downloader.info_post(url)
             key = 'tiktok_' + str(info.id) + '.mp4'
+            cdn_url = get_cdn_url(key)
             filename = 'tmp/' + key
 
             if check_if_exists:
@@ -357,16 +376,19 @@ class TiktokArchiver(Archiver):
                 except ClientError:
                     pass
 
-            if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        do_s3_upload(self.s3, f, key)
+            media = tiktok_downloader.snaptik(url).get_media()
 
-                    cdn_url = get_cdn_url(key)
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
                 else:
-                    status = 'could not download media'
+                    return ArchiveResult(status='Could not download media')
+
+            media[0].download(filename)
+
+            if status != 'already archived':
+                with open(filename, 'rb') as f:
+                    do_s3_upload(self.s3, f, key)
 
             try:
                 key_thumb, thumb_index = get_thumbnails(
diff --git a/auto_archive.py b/auto_archive.py
index ef4f89c..fe2ccfd 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -10,6 +10,7 @@ import math
 import threading
 from loguru import logger
 import archivers
+import requests
 
 load_dotenv()
 
@@ -43,7 +44,7 @@ def index_to_col(index):
         return alphabet[index]
 
 
-def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
+def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
     update = []
 
     if columns['status'] is not None:
@@ -155,7 +156,6 @@ def process_sheet(sheet):
         columns['duration'] = index_to_col(headers.index(
             'duration')) if 'duration' in headers else None
 
-
         active_archivers = [
             archivers.TelegramArchiver(s3_client),
             archivers.TiktokArchiver(s3_client),
@@ -163,7 +163,6 @@ def process_sheet(sheet):
             archivers.WaybackArchiver(s3_client)
         ]
 
-
         # loop through rows in worksheet
         for i in range(2, len(values)+1):
             v = values[i-1]
@@ -174,26 +173,35 @@ def process_sheet(sheet):
 
                 # check so we don't step on each others' toes
                 if latest_val == '' or latest_val is None:
-                    wks.update(
-                        columns['status'] + str(i), 'Archive in progress')
+                    wks.update(columns['status'] + str(i),
+                               'Archive in progress')
 
                     for archiver in active_archivers:
                         logger.debug(f"Trying {archiver} on row {i}")
-                        result = archiver.download(v[url_index], check_if_exists=True)
+
+                        url = v[url_index]
+                        # expand short URL links
+                        if 'https://t.co/' in url:
+                            r = requests.get(url)
+                            url = r.url
+
+                        result = archiver.download(url, check_if_exists=True)
                         if result:
                             logger.info(f"{archiver} succeeded on row {i}")
                             break
 
                     if result:
                         update_sheet(wks, i, result, columns, v)
-
+                    else:
+                        wks.update(columns['status'] +
+                                   str(i), 'failed: no archiver')
 
                         # except:
-                            # if any unexpected errors occured, log these into the Google Sheet
-                            # t, value, traceback = sys.exc_info()
+                        # if any unexpected errors occurred, log these into the Google Sheet
+                        # t, value, traceback = sys.exc_info()
 
-                            # update_sheet(wks, i, str(
-                            #     value), {}, columns, v)
+                        # update_sheet(wks, i, str(
+                        #     value), {}, columns, v)
 
 
 def main():