From 07b5d357b478b892313f8813e4fb77764fd811c9 Mon Sep 17 00:00:00 2001 From: Logan Williams Date: Tue, 22 Feb 2022 08:20:45 +0100 Subject: [PATCH] Fix bugs in WaybackArchiver, follow redirects sometimes --- archivers.py | 48 +++++++++++++++++++++++++++++++++++------------- auto_archive.py | 30 +++++++++++++++++++----------- 2 files changed, 54 insertions(+), 24 deletions(-) diff --git a/archivers.py b/archivers.py index d8a72f6..7c8df8c 100644 --- a/archivers.py +++ b/archivers.py @@ -210,7 +210,11 @@ class YoutubeDLArchiver(Archiver): if 'entries' in info: if len(info['entries']) > 1: logger.warning( - 'YoutubeDLArchiver cannot archive channels or pages with multiple videos') + 'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos') + return False + elif len(info['entries']) == 0: + logger.warning( + 'YoutubeDLArchiver succeeded but did not find video') return False filename = ydl.prepare_filename(info['entries'][0]) @@ -257,13 +261,21 @@ class YoutubeDLArchiver(Archiver): duration = info['duration'] if 'duration' in info else None # get thumbnails - key_thumb, thumb_index = get_thumbnails( - filename, self.s3, duration=duration) + try: + key_thumb, thumb_index = get_thumbnails( + filename, self.s3, duration=duration) + except: + key_thumb = '' + thumb_index = 'Could not generate thumbnails' + os.remove(filename) + timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime( + info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None + return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration, title=info['title'] if 'title' in info else None, - timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None) + timestamp=timestamp) class WaybackArchiver(Archiver): @@ -286,6 +298,9 @@ class WaybackArchiver(Archiver): if r.status_code != 200: return ArchiveResult(status="Internet archive failed") + if 'job_id' not in r.json() and 'message' in r.json(): + return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}") + job_id = r.json()['job_id'] status_r = requests.get( @@ -311,7 +326,7 @@ class WaybackArchiver(Archiver): status_json = status_r.json() if status_json['status'] != 'success': - return ArchiveResult(status='Internet Archive failed: ' + status_json['message']) + return ArchiveResult(status='Internet Archive failed: ' + str(status_json)) archive_url = 'https://web.archive.org/web/' + \ status_json['timestamp'] + '/' + status_json['original_url'] @@ -324,6 +339,9 @@ class WaybackArchiver(Archiver): title = parsed.find_all('title')[ 0].text + + if title == 'Wayback Machine': + title = 'Could not get title' except: title = "Could not get title" @@ -343,6 +361,7 @@ class TiktokArchiver(Archiver): try: info = tiktok_downloader.info_post(url) key = 'tiktok_' + str(info.id) + '.mp4' + cdn_url = get_cdn_url(key) filename = 'tmp/' + key if check_if_exists: @@ -357,16 +376,19 @@ class TiktokArchiver(Archiver): except ClientError: pass - if status != 'already archived': - media = tiktok_downloader.snaptik(url).get_media() - if len(media) > 0: - media[0].download(filename) - with open(filename, 'rb') as f: - do_s3_upload(self.s3, f, key) + media = tiktok_downloader.snaptik(url).get_media() - cdn_url = get_cdn_url(key) + if len(media) <= 0: + if status == 'already archived': + return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url) else: - status = 'could not download media' + return ArchiveResult(status='Could not download media') + + media[0].download(filename) + + if status != 'already archived': + with open(filename, 'rb') as f: + do_s3_upload(self.s3, f, key) try: key_thumb, thumb_index = get_thumbnails( diff --git a/auto_archive.py b/auto_archive.py index ef4f89c..fe2ccfd 100644 --- a/auto_archive.py +++ b/auto_archive.py @@ -10,6 +10,7 @@ import math import threading from loguru import logger import archivers +import requests load_dotenv() @@ -43,7 +44,7 @@ def index_to_col(index): return alphabet[index] -def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v): +def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v): update = [] if columns['status'] is not None: @@ -155,7 +156,6 @@ def process_sheet(sheet): columns['duration'] = index_to_col(headers.index( 'duration')) if 'duration' in headers else None - active_archivers = [ archivers.TelegramArchiver(s3_client), archivers.TiktokArchiver(s3_client), @@ -163,7 +163,6 @@ def process_sheet(sheet): archivers.WaybackArchiver(s3_client) ] - # loop through rows in worksheet for i in range(2, len(values)+1): v = values[i-1] @@ -174,26 +173,35 @@ def process_sheet(sheet): # check so we don't step on each others' toes if latest_val == '' or latest_val is None: - wks.update( - columns['status'] + str(i), 'Archive in progress') + wks.update(columns['status'] + str(i), + 'Archive in progress') for archiver in active_archivers: logger.debug(f"Trying {archiver} on row {i}") - result = archiver.download(v[url_index], check_if_exists=True) + + url = v[url_index] + # expand short URL links + if 'https://t.co/' in url: + r = requests.get(url) + url = r.url + + result = archiver.download(url, check_if_exists=True) if result: logger.info(f"{archiver} succeeded on row {i}") break if result: update_sheet(wks, i, result, columns, v) - + else: + wks.update(columns['status'] + + str(i), 'failed: no archiver') # except: - # if any unexpected errors occured, log these into the Google Sheet - # t, value, traceback = sys.exc_info() + # if any unexpected errors occured, log these into the Google Sheet + # t, value, traceback = sys.exc_info() - # update_sheet(wks, i, str( - # value), {}, columns, v) + # update_sheet(wks, i, str( + # value), {}, columns, v) def main():