Fix bugs in WaybackArchiver, follow redirects sometimes

pull/13/head
Logan Williams 2022-02-22 08:20:45 +01:00
parent 009c0dd8ca
commit 07b5d357b4
2 changed files with 54 additions and 24 deletions

View file

@@ -210,7 +210,11 @@ class YoutubeDLArchiver(Archiver):
         if 'entries' in info:
             if len(info['entries']) > 1:
                 logger.warning(
-                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                    'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                return False
+            elif len(info['entries']) == 0:
+                logger.warning(
+                    'YoutubeDLArchiver succeeded but did not find video')
                 return False
 
             filename = ydl.prepare_filename(info['entries'][0])
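For context on the new guard: youtube-dl returns playlist-like pages (channels, playlists, multi-video posts) as a dict with an 'entries' list, while a single video has no 'entries' key at all, so checking the list length is how the archiver rejects pages it cannot handle. A minimal sketch of that behaviour, assuming youtube_dl is installed and using a hypothetical URL:

```python
import youtube_dl

url = 'https://www.youtube.com/watch?v=example'  # hypothetical URL

with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
    info = ydl.extract_info(url, download=False)

if 'entries' in info:
    # playlist, channel, or multi-video page
    print(f"playlist-like page with {len(info['entries'])} entries")
else:
    print('single video:', info.get('title'))
```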
@@ -257,13 +261,21 @@ class YoutubeDLArchiver(Archiver):
         duration = info['duration'] if 'duration' in info else None
 
         # get thumbnails
-        key_thumb, thumb_index = get_thumbnails(
-            filename, self.s3, duration=duration)
+        try:
+            key_thumb, thumb_index = get_thumbnails(
+                filename, self.s3, duration=duration)
+        except:
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'
+
         os.remove(filename)
 
+        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(
+            info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
+
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                              title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+                             timestamp=timestamp)
 
 
 class WaybackArchiver(Archiver):
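The inline conditional that builds `timestamp` is dense; unrolled, the fallback chain reads as below. This is a sketch with a hypothetical helper name, not code from the commit:

```python
import datetime

def resolve_timestamp(info: dict):
    # prefer the exact epoch timestamp when the extractor provides one
    if 'timestamp' in info:
        return info['timestamp']
    # otherwise derive it from the YYYYMMDD upload date, if present
    if info.get('upload_date') is not None:
        return datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()
    return None
```

The `is not None` check is the actual bug fix here: some extractors set `upload_date` to None, which previously crashed `strptime`.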
@@ -286,6 +298,9 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")
 
+        if 'job_id' not in r.json() and 'message' in r.json():
+            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
+
         job_id = r.json()['job_id']
 
         status_r = requests.get(
@@ -311,7 +326,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()
 
         if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
 
         archive_url = 'https://web.archive.org/web/' + \
             status_json['timestamp'] + '/' + status_json['original_url']
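Both Wayback fixes harden the same exchange with the Save Page Now (SPN2) API: the submit call can return an error body carrying a `message` instead of a `job_id` (rate limiting is one case), and a failed status poll is not guaranteed to include `message` either, hence dumping the whole payload with `str(status_json)`. A rough sketch of the round trip; the endpoint paths follow the public SPN2 API, but the response shapes shown in comments are assumptions:

```python
import requests

r = requests.post('https://web.archive.org/save/',
                  headers={'Accept': 'application/json'},
                  data={'url': 'https://example.com'})
body = r.json()

if 'job_id' not in body and 'message' in body:
    # e.g. a rate-limit error comes back as a message, not a job
    raise RuntimeError(f"Internet archive failed: {body['message']}")

status_json = requests.get(
    f"https://web.archive.org/save/status/{body['job_id']}",
    headers={'Accept': 'application/json'}).json()

if status_json['status'] != 'success':
    # 'message' may be absent on failure, so show the whole payload
    print('Internet Archive failed: ' + str(status_json))
```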
@@ -324,6 +339,9 @@ class WaybackArchiver(Archiver):
             title = parsed.find_all('title')[
                 0].text
+
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
         except:
             title = "Could not get title"
@@ -343,6 +361,7 @@ class TiktokArchiver(Archiver):
         try:
             info = tiktok_downloader.info_post(url)
             key = 'tiktok_' + str(info.id) + '.mp4'
+            cdn_url = get_cdn_url(key)
             filename = 'tmp/' + key
 
             if check_if_exists:
@@ -357,16 +376,19 @@ class TiktokArchiver(Archiver):
             except ClientError:
                 pass
 
-            if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        do_s3_upload(self.s3, f, key)
-                    cdn_url = get_cdn_url(key)
+            media = tiktok_downloader.snaptik(url).get_media()
+
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
                 else:
-                    status = 'could not download media'
+                    return ArchiveResult(status='Could not download media')
+
+            media[0].download(filename)
+
+            if status != 'already archived':
+                with open(filename, 'rb') as f:
+                    do_s3_upload(self.s3, f, key)
 
             try:
                 key_thumb, thumb_index = get_thumbnails(
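Worth noting why `cdn_url = get_cdn_url(key)` moved up to the top of the try block: previously it was assigned only on the fresh-download path, so the new "already archived but media gone" return would have referenced an unbound name. A minimal illustration of that hazard, with hypothetical names:

```python
def before(already_archived: bool) -> str:
    if not already_archived:
        cdn_url = 'https://cdn.example.com/key.mp4'  # only set on this path
    return cdn_url  # UnboundLocalError when already_archived is True

def after(already_archived: bool) -> str:
    cdn_url = 'https://cdn.example.com/key.mp4'  # hoisted: set unconditionally
    return cdn_url  # safe on every path
```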

View file

@@ -10,6 +10,7 @@ import math
 import threading
 from loguru import logger
 import archivers
+import requests
 
 load_dotenv()
@@ -43,7 +44,7 @@ def index_to_col(index):
     return alphabet[index]
 
 
-def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
+def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
     update = []
 
     if columns['status'] is not None:
@@ -155,7 +156,6 @@ def process_sheet(sheet):
         columns['duration'] = index_to_col(headers.index(
             'duration')) if 'duration' in headers else None
 
-
         active_archivers = [
             archivers.TelegramArchiver(s3_client),
             archivers.TiktokArchiver(s3_client),
@@ -163,7 +163,6 @@ def process_sheet(sheet):
             archivers.WaybackArchiver(s3_client)
         ]
 
-
         # loop through rows in worksheet
         for i in range(2, len(values)+1):
             v = values[i-1]
@@ -174,26 +173,35 @@ def process_sheet(sheet):
                 # check so we don't step on each others' toes
                 if latest_val == '' or latest_val is None:
-                    wks.update(
-                        columns['status'] + str(i), 'Archive in progress')
+                    wks.update(columns['status'] + str(i),
+                               'Archive in progress')
 
                     for archiver in active_archivers:
                         logger.debug(f"Trying {archiver} on row {i}")
-                        result = archiver.download(v[url_index], check_if_exists=True)
+
+                        url = v[url_index]
+
+                        # expand short URL links
+                        if 'https://t.co/' in url:
+                            r = requests.get(url)
+                            url = r.url
+
+                        result = archiver.download(url, check_if_exists=True)
+
                         if result:
                             logger.info(f"{archiver} succeeded on row {i}")
                             break
 
                     if result:
                         update_sheet(wks, i, result, columns, v)
+                    else:
+                        wks.update(columns['status'] +
+                                   str(i), 'failed: no archiver')
 
         # except:
         #     if any unexpected errors occured, log these into the Google Sheet
         #     t, value, traceback = sys.exc_info()
         #     update_sheet(wks, i, str(
         #         value), {}, columns, v)
 
 
 def main():
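The "follow redirects sometimes" part of this commit is the `https://t.co/` branch above: `requests.get` follows redirects by default, and `Response.url` holds the final destination, so Twitter's shortener is expanded before any archiver sees the link. An equivalent helper, sketched with a hypothetical name and using HEAD to avoid downloading the response body (note that HEAD requires `allow_redirects=True` explicitly in requests):

```python
import requests

def expand_url(url: str) -> str:
    """Resolve t.co short links to their final destination."""
    if 'https://t.co/' in url:
        return requests.head(url, allow_redirects=True).url
    return url

print(expand_url('https://t.co/abc123'))  # hypothetical short link
```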