Fix bugs in WaybackArchiver; follow redirects to expand shortened (t.co) URLs before archiving

pull/13/head
Logan Williams 2022-02-22 08:20:45 +01:00
rodzic 009c0dd8ca
commit 07b5d357b4
2 zmienionych plików z 54 dodań i 24 usunięć

Wyświetl plik

@ -210,7 +210,11 @@ class YoutubeDLArchiver(Archiver):
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
return False
elif len(info['entries']) == 0:
logger.warning(
'YoutubeDLArchiver succeeded but did not find video')
return False
filename = ydl.prepare_filename(info['entries'][0])
@ -257,13 +261,21 @@ class YoutubeDLArchiver(Archiver):
duration = info['duration'] if 'duration' in info else None
# get thumbnails
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
try:
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
except:
key_thumb = ''
thumb_index = 'Could not generate thumbnails'
os.remove(filename)
timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(
info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None,
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
timestamp=timestamp)
class WaybackArchiver(Archiver):
@ -286,6 +298,9 @@ class WaybackArchiver(Archiver):
if r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
if 'job_id' not in r.json() and 'message' in r.json():
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
job_id = r.json()['job_id']
status_r = requests.get(
@ -311,7 +326,7 @@ class WaybackArchiver(Archiver):
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
@ -324,6 +339,9 @@ class WaybackArchiver(Archiver):
title = parsed.find_all('title')[
0].text
if title == 'Wayback Machine':
title = 'Could not get title'
except:
title = "Could not get title"
@ -343,6 +361,7 @@ class TiktokArchiver(Archiver):
try:
info = tiktok_downloader.info_post(url)
key = 'tiktok_' + str(info.id) + '.mp4'
cdn_url = get_cdn_url(key)
filename = 'tmp/' + key
if check_if_exists:
@ -357,16 +376,19 @@ class TiktokArchiver(Archiver):
except ClientError:
pass
if status != 'already archived':
media = tiktok_downloader.snaptik(url).get_media()
if len(media) > 0:
media[0].download(filename)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
media = tiktok_downloader.snaptik(url).get_media()
cdn_url = get_cdn_url(key)
if len(media) <= 0:
if status == 'already archived':
return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
else:
status = 'could not download media'
return ArchiveResult(status='Could not download media')
media[0].download(filename)
if status != 'already archived':
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
try:
key_thumb, thumb_index = get_thumbnails(

Wyświetl plik

@ -10,6 +10,7 @@ import math
import threading
from loguru import logger
import archivers
import requests
load_dotenv()
@ -43,7 +44,7 @@ def index_to_col(index):
return alphabet[index]
def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
update = []
if columns['status'] is not None:
@ -155,7 +156,6 @@ def process_sheet(sheet):
columns['duration'] = index_to_col(headers.index(
'duration')) if 'duration' in headers else None
active_archivers = [
archivers.TelegramArchiver(s3_client),
archivers.TiktokArchiver(s3_client),
@ -163,7 +163,6 @@ def process_sheet(sheet):
archivers.WaybackArchiver(s3_client)
]
# loop through rows in worksheet
for i in range(2, len(values)+1):
v = values[i-1]
@ -174,26 +173,35 @@ def process_sheet(sheet):
# check so we don't step on each others' toes
if latest_val == '' or latest_val is None:
wks.update(
columns['status'] + str(i), 'Archive in progress')
wks.update(columns['status'] + str(i),
'Archive in progress')
for archiver in active_archivers:
logger.debug(f"Trying {archiver} on row {i}")
result = archiver.download(v[url_index], check_if_exists=True)
url = v[url_index]
# expand short URL links
if 'https://t.co/' in url:
r = requests.get(url)
url = r.url
result = archiver.download(url, check_if_exists=True)
if result:
logger.info(f"{archiver} succeeded on row {i}")
break
if result:
update_sheet(wks, i, result, columns, v)
else:
wks.update(columns['status'] +
str(i), 'failed: no archiver')
# except:
# if any unexpected errors occurred, log these into the Google Sheet
# t, value, traceback = sys.exc_info()
# if any unexpected errors occurred, log these into the Google Sheet
# t, value, traceback = sys.exc_info()
# update_sheet(wks, i, str(
# value), {}, columns, v)
# update_sheet(wks, i, str(
# value), {}, columns, v)
def main():