kopia lustrzana https://github.com/bellingcat/auto-archiver
Fix bugs in WaybackArchiver, follow redirects sometimes
rodzic
009c0dd8ca
commit
07b5d357b4
48
archivers.py
48
archivers.py
|
@ -210,7 +210,11 @@ class YoutubeDLArchiver(Archiver):
|
|||
if 'entries' in info:
|
||||
if len(info['entries']) > 1:
|
||||
logger.warning(
|
||||
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
|
||||
'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
|
||||
return False
|
||||
elif len(info['entries']) == 0:
|
||||
logger.warning(
|
||||
'YoutubeDLArchiver succeeded but did not find video')
|
||||
return False
|
||||
|
||||
filename = ydl.prepare_filename(info['entries'][0])
|
||||
|
@ -257,13 +261,21 @@ class YoutubeDLArchiver(Archiver):
|
|||
duration = info['duration'] if 'duration' in info else None
|
||||
|
||||
# get thumbnails
|
||||
key_thumb, thumb_index = get_thumbnails(
|
||||
filename, self.s3, duration=duration)
|
||||
try:
|
||||
key_thumb, thumb_index = get_thumbnails(
|
||||
filename, self.s3, duration=duration)
|
||||
except:
|
||||
key_thumb = ''
|
||||
thumb_index = 'Could not generate thumbnails'
|
||||
|
||||
os.remove(filename)
|
||||
|
||||
timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(
|
||||
info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
|
||||
|
||||
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
|
||||
title=info['title'] if 'title' in info else None,
|
||||
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
|
||||
timestamp=timestamp)
|
||||
|
||||
|
||||
class WaybackArchiver(Archiver):
|
||||
|
@ -286,6 +298,9 @@ class WaybackArchiver(Archiver):
|
|||
if r.status_code != 200:
|
||||
return ArchiveResult(status="Internet archive failed")
|
||||
|
||||
if 'job_id' not in r.json() and 'message' in r.json():
|
||||
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
|
||||
|
||||
job_id = r.json()['job_id']
|
||||
|
||||
status_r = requests.get(
|
||||
|
@ -311,7 +326,7 @@ class WaybackArchiver(Archiver):
|
|||
status_json = status_r.json()
|
||||
|
||||
if status_json['status'] != 'success':
|
||||
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
|
||||
return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
|
||||
|
||||
archive_url = 'https://web.archive.org/web/' + \
|
||||
status_json['timestamp'] + '/' + status_json['original_url']
|
||||
|
@ -324,6 +339,9 @@ class WaybackArchiver(Archiver):
|
|||
|
||||
title = parsed.find_all('title')[
|
||||
0].text
|
||||
|
||||
if title == 'Wayback Machine':
|
||||
title = 'Could not get title'
|
||||
except:
|
||||
title = "Could not get title"
|
||||
|
||||
|
@ -343,6 +361,7 @@ class TiktokArchiver(Archiver):
|
|||
try:
|
||||
info = tiktok_downloader.info_post(url)
|
||||
key = 'tiktok_' + str(info.id) + '.mp4'
|
||||
cdn_url = get_cdn_url(key)
|
||||
filename = 'tmp/' + key
|
||||
|
||||
if check_if_exists:
|
||||
|
@ -357,16 +376,19 @@ class TiktokArchiver(Archiver):
|
|||
except ClientError:
|
||||
pass
|
||||
|
||||
if status != 'already archived':
|
||||
media = tiktok_downloader.snaptik(url).get_media()
|
||||
if len(media) > 0:
|
||||
media[0].download(filename)
|
||||
with open(filename, 'rb') as f:
|
||||
do_s3_upload(self.s3, f, key)
|
||||
media = tiktok_downloader.snaptik(url).get_media()
|
||||
|
||||
cdn_url = get_cdn_url(key)
|
||||
if len(media) <= 0:
|
||||
if status == 'already archived':
|
||||
return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
|
||||
else:
|
||||
status = 'could not download media'
|
||||
return ArchiveResult(status='Could not download media')
|
||||
|
||||
media[0].download(filename)
|
||||
|
||||
if status != 'already archived':
|
||||
with open(filename, 'rb') as f:
|
||||
do_s3_upload(self.s3, f, key)
|
||||
|
||||
try:
|
||||
key_thumb, thumb_index = get_thumbnails(
|
||||
|
|
|
@ -10,6 +10,7 @@ import math
|
|||
import threading
|
||||
from loguru import logger
|
||||
import archivers
|
||||
import requests
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
@ -43,7 +44,7 @@ def index_to_col(index):
|
|||
return alphabet[index]
|
||||
|
||||
|
||||
def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
|
||||
def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
|
||||
update = []
|
||||
|
||||
if columns['status'] is not None:
|
||||
|
@ -155,7 +156,6 @@ def process_sheet(sheet):
|
|||
columns['duration'] = index_to_col(headers.index(
|
||||
'duration')) if 'duration' in headers else None
|
||||
|
||||
|
||||
active_archivers = [
|
||||
archivers.TelegramArchiver(s3_client),
|
||||
archivers.TiktokArchiver(s3_client),
|
||||
|
@ -163,7 +163,6 @@ def process_sheet(sheet):
|
|||
archivers.WaybackArchiver(s3_client)
|
||||
]
|
||||
|
||||
|
||||
# loop through rows in worksheet
|
||||
for i in range(2, len(values)+1):
|
||||
v = values[i-1]
|
||||
|
@ -174,26 +173,35 @@ def process_sheet(sheet):
|
|||
|
||||
# check so we don't step on each others' toes
|
||||
if latest_val == '' or latest_val is None:
|
||||
wks.update(
|
||||
columns['status'] + str(i), 'Archive in progress')
|
||||
wks.update(columns['status'] + str(i),
|
||||
'Archive in progress')
|
||||
|
||||
for archiver in active_archivers:
|
||||
logger.debug(f"Trying {archiver} on row {i}")
|
||||
result = archiver.download(v[url_index], check_if_exists=True)
|
||||
|
||||
url = v[url_index]
|
||||
# expand short URL links
|
||||
if 'https://t.co/' in url:
|
||||
r = requests.get(url)
|
||||
url = r.url
|
||||
|
||||
result = archiver.download(url, check_if_exists=True)
|
||||
if result:
|
||||
logger.info(f"{archiver} succeeded on row {i}")
|
||||
break
|
||||
|
||||
if result:
|
||||
update_sheet(wks, i, result, columns, v)
|
||||
|
||||
else:
|
||||
wks.update(columns['status'] +
|
||||
str(i), 'failed: no archiver')
|
||||
|
||||
# except:
|
||||
# if any unexpected errors occurred, log these into the Google Sheet
|
||||
# t, value, traceback = sys.exc_info()
|
||||
# if any unexpected errors occurred, log these into the Google Sheet
|
||||
# t, value, traceback = sys.exc_info()
|
||||
|
||||
# update_sheet(wks, i, str(
|
||||
# value), {}, columns, v)
|
||||
# update_sheet(wks, i, str(
|
||||
# value), {}, columns, v)
|
||||
|
||||
|
||||
def main():
|
||||
|
|
Ładowanie…
Reference in New Issue