Mirrored from https://github.com/bellingcat/auto-archiver

Fix bugs in WaybackArchiver, follow redirects sometimes

parent 009c0dd8ca
commit 07b5d357b4
archivers.py (48 changed lines)
@@ -210,7 +210,11 @@ class YoutubeDLArchiver(Archiver):
         if 'entries' in info:
             if len(info['entries']) > 1:
                 logger.warning(
-                    'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                    'YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                return False
+            elif len(info['entries']) == 0:
+                logger.warning(
+                    'YoutubeDLArchiver succeeded but did not find video')
                 return False
 
             filename = ydl.prepare_filename(info['entries'][0])
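Both new branches hinge on youtube_dl's convention that playlist-style pages come back with an 'entries' list while single videos do not. A minimal sketch of that behavior (placeholder URL, youtube_dl as used elsewhere in this file):

import youtube_dl

with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
    # download=False: only probe metadata, don't fetch the media itself
    info = ydl.extract_info('https://example.com/some-page', download=False)

if 'entries' in info:
    # channel/playlist page: zero or more per-video dicts
    print(f"{len(info['entries'])} entries found")
else:
    print('single video:', info.get('title'))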
|
@@ -257,13 +261,21 @@ class YoutubeDLArchiver(Archiver):
         duration = info['duration'] if 'duration' in info else None
 
         # get thumbnails
-        key_thumb, thumb_index = get_thumbnails(
-            filename, self.s3, duration=duration)
+        try:
+            key_thumb, thumb_index = get_thumbnails(
+                filename, self.s3, duration=duration)
+        except:
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'
+
         os.remove(filename)
 
+        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(
+            info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
+
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
                              title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+                             timestamp=timestamp)
 
 
 class WaybackArchiver(Archiver):
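The new standalone timestamp expression chains two fallbacks; unrolled into plain statements (hypothetical helper name, identical logic):

import datetime

def derive_timestamp(info):
    # 1. prefer the extractor-supplied unix timestamp
    if 'timestamp' in info:
        return info['timestamp']
    # 2. otherwise derive one from the YYYYMMDD 'upload_date' string
    if 'upload_date' in info and info['upload_date'] is not None:
        return datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()
    # 3. otherwise there is no usable date
    return None

Note the added `is not None` guard: the old inline version at the bottom of the hunk only checked `'upload_date' in info`, so a present-but-null upload_date crashed strptime.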
|
@@ -286,6 +298,9 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")
 
+        if 'job_id' not in r.json() and 'message' in r.json():
+            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
+
         job_id = r.json()['job_id']
 
         status_r = requests.get(
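This guard fixes a crash path: the Save Page Now endpoint can answer HTTP 200 yet decline the capture, in which case the JSON body carries a 'message' instead of a 'job_id', and the old code's r.json()['job_id'] raised a KeyError. A sketch under those assumptions (illustrative payloads, not taken from the API docs; RuntimeError stands in for the ArchiveResult return):

# accepted: a capture job was queued
#   {'url': 'https://example.com', 'job_id': 'spn2-abc123'}
# declined, still HTTP 200: only an explanation is returned
#   {'status': 'error', 'message': 'Cannot resolve host.'}

data = r.json()
if 'job_id' not in data and 'message' in data:
    # surface the archive's own explanation instead of crashing
    raise RuntimeError(f"Internet archive failed: {data['message']}")
job_id = data['job_id']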
|
@@ -311,7 +326,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()
 
         if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
 
         archive_url = 'https://web.archive.org/web/' + \
             status_json['timestamp'] + '/' + status_json['original_url']
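The second KeyError fix: a failed capture's status payload is not guaranteed to carry a 'message' key, so the old status_json['message'] could crash while reporting the error; str(status_json) records whatever fields are present. For instance (illustrative payload):

status_json = {'status': 'error', 'status_ext': 'error:proxy-error'}
# no 'message' key, yet the full context still lands in the spreadsheet
print('Internet Archive failed: ' + str(status_json))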
|
@@ -324,6 +339,9 @@ class WaybackArchiver(Archiver):
 
             title = parsed.find_all('title')[
                 0].text
+
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
         except:
             title = "Could not get title"
 
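While a capture is still being processed, web.archive.org can serve a placeholder page whose <title> is literally "Wayback Machine", which the old code happily recorded as the archived page's title. A sketch of the surrounding scrape (assuming archive_url from earlier in this method and BeautifulSoup as the parser, per parsed.find_all):

import requests
from bs4 import BeautifulSoup

r = requests.get(archive_url)
parsed = BeautifulSoup(r.text, features='html.parser')
title = parsed.find_all('title')[0].text
if title == 'Wayback Machine':
    # placeholder page, not the target site's real title
    title = 'Could not get title'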
|
@@ -343,6 +361,7 @@ class TiktokArchiver(Archiver):
         try:
             info = tiktok_downloader.info_post(url)
             key = 'tiktok_' + str(info.id) + '.mp4'
+            cdn_url = get_cdn_url(key)
             filename = 'tmp/' + key
 
             if check_if_exists:
@@ -357,16 +376,19 @@ class TiktokArchiver(Archiver):
                 except ClientError:
                     pass
 
-            if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        do_s3_upload(self.s3, f, key)
-
-                    cdn_url = get_cdn_url(key)
-                else:
-                    status = 'could not download media'
+            media = tiktok_downloader.snaptik(url).get_media()
+
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
+                else:
+                    return ArchiveResult(status='Could not download media')
+
+            media[0].download(filename)
+
+            if status != 'already archived':
+                with open(filename, 'rb') as f:
+                    do_s3_upload(self.s3, f, key)
 
             try:
                 key_thumb, thumb_index = get_thumbnails(
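Read as contiguous code, the reordered TikTok logic becomes: fetch the media list first, bail out early when it is empty (still returning the pre-computed cdn_url when the object already exists on S3), and only upload when this run actually produced a file:

media = tiktok_downloader.snaptik(url).get_media()

if len(media) <= 0:
    if status == 'already archived':
        # nothing downloadable now, but a previous run stored it
        return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
    else:
        return ArchiveResult(status='Could not download media')

media[0].download(filename)

if status != 'already archived':
    with open(filename, 'rb') as f:
        do_s3_upload(self.s3, f, key)

This is also why cdn_url = get_cdn_url(key) moved up into the earlier hunk: the early-return branch needs it even when nothing is downloaded.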
|
auto_archive.py

@@ -10,6 +10,7 @@ import math
 import threading
 from loguru import logger
 import archivers
+import requests
 
 load_dotenv()
 
|
@@ -43,7 +44,7 @@ def index_to_col(index):
     return alphabet[index]
 
 
-def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
+def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
     update = []
 
     if columns['status'] is not None:
|
@@ -155,7 +156,6 @@ def process_sheet(sheet):
     columns['duration'] = index_to_col(headers.index(
         'duration')) if 'duration' in headers else None
 
-
     active_archivers = [
         archivers.TelegramArchiver(s3_client),
         archivers.TiktokArchiver(s3_client),
|
@@ -163,7 +163,6 @@ def process_sheet(sheet):
         archivers.WaybackArchiver(s3_client)
     ]
 
-
     # loop through rows in worksheet
     for i in range(2, len(values)+1):
         v = values[i-1]
|
@@ -174,26 +173,35 @@ def process_sheet(sheet):
 
             # check so we don't step on each others' toes
             if latest_val == '' or latest_val is None:
-                wks.update(
-                    columns['status'] + str(i), 'Archive in progress')
+                wks.update(columns['status'] + str(i),
+                           'Archive in progress')
 
                 for archiver in active_archivers:
                     logger.debug(f"Trying {archiver} on row {i}")
-                    result = archiver.download(v[url_index], check_if_exists=True)
+
+                    url = v[url_index]
+                    # expand short URL links
+                    if 'https://t.co/' in url:
+                        r = requests.get(url)
+                        url = r.url
+
+                    result = archiver.download(url, check_if_exists=True)
                     if result:
                         logger.info(f"{archiver} succeeded on row {i}")
                         break
 
                 if result:
                     update_sheet(wks, i, result, columns, v)
+                else:
+                    wks.update(columns['status'] +
+                               str(i), 'failed: no archiver')
 
         # except:
         #     if any unexpected errors occured, log these into the Google Sheet
         #     t, value, traceback = sys.exc_info()
 
         #     update_sheet(wks, i, str(
         #         value), {}, columns, v)
 
 
 def main():
 
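The commit title's "follow redirects sometimes" is this block: requests follows HTTP redirects by default on GET, and Response.url holds the final URL after every hop, so a t.co short link expands to its real destination before being handed to the archivers. A minimal sketch (placeholder short link):

import requests

r = requests.get('https://t.co/abc123')  # placeholder; any t.co link redirects
print(r.url)      # the fully expanded destination URL
print(r.history)  # intermediate redirect responses, oldest first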
|
|