Fix bugs in WaybackArchiver; follow redirects to expand shortened (t.co) URLs before archiving

pull/13/head
Logan Williams 2022-02-22 08:20:45 +01:00
rodzic 009c0dd8ca
commit 07b5d357b4
2 zmienionych plików z 54 dodań i 24 usunięć

Wyświetl plik

@ -210,7 +210,11 @@ class YoutubeDLArchiver(Archiver):
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
return False
elif len(info['entries']) == 0:
logger.warning(
'YoutubeDLArchiver succeeded but did not find video')
return False
filename = ydl.prepare_filename(info['entries'][0])
@ -257,13 +261,21 @@ class YoutubeDLArchiver(Archiver):
duration = info['duration'] if 'duration' in info else None
# get thumbnails
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
try:
key_thumb, thumb_index = get_thumbnails(
filename, self.s3, duration=duration)
except:
key_thumb = ''
thumb_index = 'Could not generate thumbnails'
os.remove(filename)
timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(
info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None,
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
timestamp=timestamp)
class WaybackArchiver(Archiver):
@ -286,6 +298,9 @@ class WaybackArchiver(Archiver):
if r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
if 'job_id' not in r.json() and 'message' in r.json():
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
job_id = r.json()['job_id']
status_r = requests.get(
@ -311,7 +326,7 @@ class WaybackArchiver(Archiver):
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
@ -324,6 +339,9 @@ class WaybackArchiver(Archiver):
title = parsed.find_all('title')[
0].text
if title == 'Wayback Machine':
title = 'Could not get title'
except:
title = "Could not get title"
@ -343,6 +361,7 @@ class TiktokArchiver(Archiver):
try:
info = tiktok_downloader.info_post(url)
key = 'tiktok_' + str(info.id) + '.mp4'
cdn_url = get_cdn_url(key)
filename = 'tmp/' + key
if check_if_exists:
@ -357,16 +376,19 @@ class TiktokArchiver(Archiver):
except ClientError:
pass
if status != 'already archived':
media = tiktok_downloader.snaptik(url).get_media()
if len(media) > 0:
media[0].download(filename)
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
media = tiktok_downloader.snaptik(url).get_media()
cdn_url = get_cdn_url(key)
if len(media) <= 0:
if status == 'already archived':
return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
else:
status = 'could not download media'
return ArchiveResult(status='Could not download media')
media[0].download(filename)
if status != 'already archived':
with open(filename, 'rb') as f:
do_s3_upload(self.s3, f, key)
try:
key_thumb, thumb_index = get_thumbnails(

Wyświetl plik

@ -10,6 +10,7 @@ import math
import threading
from loguru import logger
import archivers
import requests
load_dotenv()
@ -43,7 +44,7 @@ def index_to_col(index):
return alphabet[index]
def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
update = []
if columns['status'] is not None:
@ -155,7 +156,6 @@ def process_sheet(sheet):
columns['duration'] = index_to_col(headers.index(
'duration')) if 'duration' in headers else None
active_archivers = [
archivers.TelegramArchiver(s3_client),
archivers.TiktokArchiver(s3_client),
@ -163,7 +163,6 @@ def process_sheet(sheet):
archivers.WaybackArchiver(s3_client)
]
# loop through rows in worksheet
for i in range(2, len(values)+1):
v = values[i-1]
@ -174,26 +173,35 @@ def process_sheet(sheet):
# check so we don't step on each others' toes
if latest_val == '' or latest_val is None:
wks.update(
columns['status'] + str(i), 'Archive in progress')
wks.update(columns['status'] + str(i),
'Archive in progress')
for archiver in active_archivers:
logger.debug(f"Trying {archiver} on row {i}")
result = archiver.download(v[url_index], check_if_exists=True)
url = v[url_index]
# expand short URL links
if 'https://t.co/' in url:
r = requests.get(url)
url = r.url
result = archiver.download(url, check_if_exists=True)
if result:
logger.info(f"{archiver} succeeded on row {i}")
break
if result:
update_sheet(wks, i, result, columns, v)
else:
wks.update(columns['status'] +
str(i), 'failed: no archiver')
# except:
# if any unexpected errors occurred, log these into the Google Sheet
# t, value, traceback = sys.exc_info()
# if any unexpected errors occurred, log these into the Google Sheet
# t, value, traceback = sys.exc_info()
# update_sheet(wks, i, str(
# value), {}, columns, v)
# update_sheet(wks, i, str(
# value), {}, columns, v)
def main():