mirror of https://github.com/bellingcat/auto-archiver
refactoring storage and bringing changes from origin
parent
f3ce226665
commit
e4603a9423
@@ -0,0 +1 @@
+from storages import *
@@ -1,17 +1,10 @@
 import os
 import ffmpeg
-from dataclasses import dataclass
 import datetime
-from loguru import logger
+from dataclasses import dataclass
+from abc import ABC, abstractmethod

-# TODO There should be a better way of generating keys, that adds the following info:
-# - name of sheet that it is being archived from
-# (this means we might archive the same media twice on different sheets, but that's OK I think)
-# - name of archiver/platform that the video comes from
-# This should make it easier to maintain and clean the archive later
+from storages import Storage

-# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
-# cleaned up? Difficult is we don't know the filename until the archivers start working.
-

 @dataclass
@@ -25,33 +18,27 @@ class ArchiveResult:
     timestamp: datetime.datetime = None


-class Archiver:
+class Archiver(ABC):
     name = "default"

-    def __init__(self, s3_client):
-        self.s3 = s3_client
+    def __init__(self, storage: Storage):
+        self.storage = storage

     def __str__(self):
         return self.__class__.__name__

-    def download(self, url, check_if_exists=False):
-        logger.error("method 'download' not implemented")
-
-    def get_cdn_url(self, key):
-        return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
-            os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
-
-    def do_s3_upload(self, f, key):
-        self.s3.upload_fileobj(f, Bucket=os.getenv(
-            'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
+    @abstractmethod
+    def download(self, url, check_if_exists=False): pass

     def get_key(self, filename):
-        print(f"key base implementation: {self.name}")
-        # TODO: refactor to be more manageable
-        key = filename.split('/')[1]
-        if 'unknown_video' in key:
-            key = key.replace('unknown_video', 'jpg')
-        return key
+        """
+        returns a key in the format "[archiverName]_[filename]" includes extension
+        """
+        tail = os.path.split(filename)[1]  # returns filename.ext from full path
+        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
+        if 'unknown_video' in _id:
+            _id = _id.replace('unknown_video', 'jpg')
+        return f'{self.name}_{_id}{extension}'

     def get_thumbnails(self, filename, duration=None):
         if not os.path.exists(filename.split('.')[0]):
@@ -80,10 +67,9 @@ class Archiver:
             thumbnail_filename = filename.split('.')[0] + '/' + fname
             key = filename.split('/')[1].split('.')[0] + '/' + fname

-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)

-            with open(thumbnail_filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(thumbnail_filename, key)

             cdn_urls.append(cdn_url)
             os.remove(thumbnail_filename)
@@ -107,9 +93,8 @@ class Archiver:

         thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'

-        self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
-            'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
+        self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})

-        thumb_index_cdn_url = self.get_cdn_url(thumb_index)
+        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)

         return (key_thumb, thumb_index_cdn_url)
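The base-class change above replaces the old tmp-path string splitting in get_key with a name-prefixed key built from the file's basename. A minimal standalone sketch of that behaviour (the DemoArchiver class and the sample filename are illustrative, not part of the commit):

# sketch only: how the refactored get_key builds storage keys
import os

class DemoArchiver:
    name = "tiktok"  # each concrete archiver sets its own name

    def get_key(self, filename):
        tail = os.path.split(filename)[1]        # filename.ext from a full path
        _id, extension = os.path.splitext(tail)  # (filename, .ext)
        if 'unknown_video' in _id:
            _id = _id.replace('unknown_video', 'jpg')
        return f'{self.name}_{_id}{extension}'

print(DemoArchiver().get_key('tmp/6712345.mp4'))  # -> tiktok_6712345.mp4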
@@ -1,10 +1,8 @@
 import os
 import requests
 from bs4 import BeautifulSoup
-from botocore.errorfactory import ClientError
-from .base_archiver import Archiver, ArchiveResult

-# TODO: get_cdn_url, get_thumbnails, do_s3_upload
+from .base_archiver import Archiver, ArchiveResult


 class TelegramArchiver(Archiver):
@@ -35,19 +33,13 @@ class TelegramArchiver(Archiver):

         video_url = video.get('src')
         key = video_url.split('/')[-1].split('?')[0]
+        key = self.get_key(key)

         filename = 'tmp/' + key

-        if check_if_exists:
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-
-                status = 'already archived'
-            except ClientError:
-                pass
+        if check_if_exists and self.storage.exists(key):
+            status = 'already archived'
+            cdn_url = self.storage.get_cdn_url(key)

         v = requests.get(video_url, headers=headers)

@@ -55,10 +47,9 @@ class TelegramArchiver(Archiver):
             f.write(v.content)

         if status != 'already archived':
-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)

-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(filename, key)

         # extract duration from HTML
         duration = s.find_all('time')[0].contents[0]
@@ -1,10 +1,8 @@
 import os, traceback
-from botocore.errorfactory import ClientError
 import tiktok_downloader
 from loguru import logger
-from .base_archiver import Archiver, ArchiveResult

-# TODO: get_cdn_url, do_s3_upload, get_thumbnails
+from .base_archiver import Archiver, ArchiveResult


 class TiktokArchiver(Archiver):
@@ -18,35 +16,28 @@ class TiktokArchiver(Archiver):

         try:
             info = tiktok_downloader.info_post(url)
-            key = 'tiktok_' + str(info.id) + '.mp4'
+            key = self.get_key(f'{info.id}.mp4')
+            cdn_url = self.get_cdn_url(key)
             filename = 'tmp/' + key

-            if check_if_exists:
-                try:
-                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                    # file exists
-                    cdn_url = self.get_cdn_url(key)
-
-                    status = 'already archived'
-                except ClientError:
-                    pass
+            if check_if_exists and self.storage.exists(key):
+                status = 'already archived'
+
+            media = tiktok_downloader.snaptik(url).get_media()
+
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
+                else:
+                    return ArchiveResult(status='Could not download media')
+
+            media[0].download(filename)

             if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        self.do_s3_upload(f, key)
-
-                    cdn_url = self.get_cdn_url(key)
-                else:
-                    status = 'could not download media'
+                self.storage.upload(filename, key)

             try:
-                key_thumb, thumb_index = self.get_thumbnails(
-                    filename, duration=info.duration)
+                key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
             except:
                 key_thumb = ''
                 thumb_index = 'error creating thumbnails'
@@ -1,14 +1,15 @@
 import time, requests, os
 from bs4 import BeautifulSoup

+from storages import Storage
 from .base_archiver import Archiver, ArchiveResult


 class WaybackArchiver(Archiver):
     name = "wayback"

-    def __init__(self, s3_client):
-        self.s3 = s3_client
+    def __init__(self, storage: Storage):
+        super(WaybackArchiver, self).__init__(storage)
         self.seen_urls = {}

     def download(self, url, check_if_exists=False):
@@ -26,10 +27,12 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")

+        if 'job_id' not in r.json() and 'message' in r.json():
+            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
+
         job_id = r.json()['job_id']

-        status_r = requests.get(
-            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+        status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)

         retries = 0

@@ -51,7 +54,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()

         if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))

         archive_url = 'https://web.archive.org/web/' + \
             status_json['timestamp'] + '/' + status_json['original_url']
@@ -59,15 +62,15 @@ class WaybackArchiver(Archiver):
         try:
             r = requests.get(archive_url)

-            parsed = BeautifulSoup(
-                r.content, 'html.parser')
+            parsed = BeautifulSoup(r.content, 'html.parser')

-            title = parsed.find_all('title')[
-                0].text
+            title = parsed.find_all('title')[0].text
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
         except:
             title = "Could not get title"

-        result = ArchiveResult(
-            status='Internet Archive fallback', cdn_url=archive_url, title=title)
+        result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
         self.seen_urls[url] = result
         return result
@@ -3,9 +3,10 @@ import os
 import datetime
 import youtube_dl
 from loguru import logger
-from botocore.errorfactory import ClientError
 from .base_archiver import Archiver, ArchiveResult


 class YoutubeDLArchiver(Archiver):
     name = "yotube_dl"
@@ -32,8 +33,11 @@ class YoutubeDLArchiver(Archiver):
         if check_if_exists:
             if 'entries' in info:
                 if len(info['entries']) > 1:
+                    logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                    return False
+                elif len(info['entries']) == 0:
                     logger.warning(
-                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                        'YoutubeDLArchiver succeeded but did not find video')
                     return False

                 filename = ydl.prepare_filename(info['entries'][0])
@@ -42,20 +46,14 @@ class YoutubeDLArchiver(Archiver):

             key = self.get_key(filename)

-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-
-                status = 'already archived'
-            except ClientError:
-                pass
+            if self.storage.exists(key):
+                status = 'already archived'
+                cdn_url = self.storage.get_cdn_url(key)

         # sometimes this results in a different filename, so do this again
         info = ydl.extract_info(url, download=True)

+        # TODO: add support for multiple videos
         if 'entries' in info:
             if len(info['entries']) > 1:
                 logger.warning(
@@ -71,18 +69,23 @@ class YoutubeDLArchiver(Archiver):

         if status != 'already archived':
             key = self.get_key(filename)
-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)

-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(filename, key)

         # get duration
         duration = info['duration'] if 'duration' in info else None

         # get thumbnails
-        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        try:
+            key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        except:
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'

         os.remove(filename)

+        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
+
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp)
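Across the Telegram, TikTok and youtube-dl archivers the commit collapses the old boto3 head_object / ClientError probing and do_s3_upload calls into the same storage-backed flow. A condensed, runnable sketch of that shared pattern (FakeStorage and the hard-coded names are illustrative, not part of the commit):

# sketch of the check-if-exists / upload pattern the archivers now share
class FakeStorage:
    def __init__(self):
        self.objects = {}

    def exists(self, key):
        return key in self.objects

    def get_cdn_url(self, key):
        return f'https://cdn.example.com/{key}'  # placeholder CDN host

    def upload(self, filename, key, **kwargs):
        self.objects[key] = filename


def archive(storage, filename, key, check_if_exists=False):
    status = 'success'
    cdn_url = None

    if check_if_exists and storage.exists(key):
        status = 'already archived'
        cdn_url = storage.get_cdn_url(key)

    if status != 'already archived':
        storage.upload(filename, key)      # replaces open() + do_s3_upload
        cdn_url = storage.get_cdn_url(key)

    return status, cdn_url


storage = FakeStorage()
print(archive(storage, 'tmp/demo.mp4', 'demo_demo.mp4'))                        # uploads
print(archive(storage, 'tmp/demo.mp4', 'demo_demo.mp4', check_if_exists=True))  # already archived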
@@ -2,12 +2,13 @@ import os
 import datetime
 import argparse
 import math
+import requests
 import gspread
-import boto3
 from loguru import logger
 from dotenv import load_dotenv

 import archivers
+from storages import S3Storage, S3Config

 load_dotenv()
@@ -103,19 +104,24 @@ def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
 def process_sheet(sheet):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)
-    n_worksheets = len(sh.worksheets())

-    s3_client = boto3.client('s3',
-                             region_name=os.getenv('DO_SPACES_REGION'),
-                             endpoint_url='https://{}.digitaloceanspaces.com'.format(
-                                 os.getenv('DO_SPACES_REGION')),
-                             aws_access_key_id=os.getenv('DO_SPACES_KEY'),
-                             aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
+    s3_config = S3Config(
+        bucket=os.getenv('DO_BUCKET'),
+        region=os.getenv('DO_SPACES_REGION'),
+        key=os.getenv('DO_SPACES_KEY'),
+        secret=os.getenv('DO_SPACES_SECRET')
+    )
+
+    # s3_client = boto3.client('s3',
+    #                          region_name=os.getenv('DO_SPACES_REGION'),
+    #                          endpoint_url='https://{}.digitaloceanspaces.com'.format(
+    #                              os.getenv('DO_SPACES_REGION')),
+    #                          aws_access_key_id=os.getenv('DO_SPACES_KEY'),
+    #                          aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))

     # loop through worksheets to check
-    for ii in range(n_worksheets):
-        logger.info("Opening worksheet " + str(ii))
-        wks = sh.get_worksheet(ii)
+    for ii, wks in enumerate(sh.worksheets()):
+        logger.info(f'Opening worksheet {ii}: "{wks.title}"')
         values = wks.get_all_values()

         headers = [v.lower() for v in values[0]]
@@ -126,7 +132,7 @@ def process_sheet(sheet):
             'source url')) if 'source url' in headers else None

         if columns['url'] is None:
-            logger.warning("No 'Media URL' column found, skipping")
+            logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
             continue

         url_index = col_to_index(columns['url'])
@@ -153,6 +159,9 @@ def process_sheet(sheet):
         columns['duration'] = index_to_col(headers.index(
             'duration')) if 'duration' in headers else None

+        # archives will be in a folder 'doc_name/worksheet_name'
+        s3_config.folder = f'{sheet}/{wks.title}/'
+        s3_client = S3Storage(s3_config)

         # order matters, first to succeed excludes remaining
         active_archivers = [
@@ -162,30 +171,36 @@ def process_sheet(sheet):
             archivers.WaybackArchiver(s3_client)
         ]


         # loop through rows in worksheet
         for i in range(2, len(values) + 1):
             v = values[i - 1]
+            url = v[url_index]

-            if v[url_index] != "" and v[col_to_index(columns['status'])] == "":
-                latest_val = wks.acell(
-                    columns['status'] + str(i)).value
+            if url != "" and v[col_to_index(columns['status'])] == "":
+                latest_val = wks.acell(columns['status'] + str(i)).value

                 # check so we don't step on each others' toes
                 if latest_val == '' or latest_val is None:
-                    wks.update(
-                        columns['status'] + str(i), 'Archive in progress')
+                    wks.update(columns['status'] + str(i), 'Archive in progress')
+
+                    # expand short URL links
+                    if 'https://t.co/' in url:
+                        r = requests.get(url)
+                        url = r.url

                     for archiver in active_archivers:
                         logger.debug(f"Trying {archiver} on row {i}")
-                        result = archiver.download(v[url_index], check_if_exists=True)
+                        result = archiver.download(url, check_if_exists=True)

                         if result:
-                            logger.info(f"{archiver} succeeded on row {i}")
+                            logger.success(f"{archiver} succeeded on row {i}")
                             break

                     if result:
                         update_sheet(wks, i, result, columns, v)
+                    else:
+                        wks.update(columns['status'] + str(i), 'failed: no archiver')

 # except:
 #   if any unexpected errors occured, log these into the Google Sheet
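The per-worksheet folder set in process_sheet above becomes a key prefix inside S3Storage (added below), so every sheet tab gets its own directory in the bucket. An illustrative sketch of the resulting paths, with made-up sheet, bucket and region names:

# illustrative only: how the folder prefix from process_sheet shapes keys and CDN URLs
folder = 'My Archive Sheet/Sheet1/'   # s3_config.folder = f'{sheet}/{wks.title}/'
bucket, region = 'my-bucket', 'sfo2'  # placeholders for DO_BUCKET / DO_SPACES_REGION
key = 'yotube_dl_abc123.mp4'          # key produced by an archiver's get_key

path = folder + key                   # what S3Storage._get_path(key) returns
print(f'https://{bucket}.{region}.cdn.digitaloceanspaces.com/{path}')
# https://my-bucket.sfo2.cdn.digitaloceanspaces.com/My Archive Sheet/Sheet1/yotube_dl_abc123.mp4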
@@ -0,0 +1,3 @@
+# we need to explicitly expose the available imports here
+from .base_storage import *
+from .s3_storage import *
@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+
+
+class Storage(ABC):
+    @abstractmethod
+    def __init__(self, config): pass
+
+    @abstractmethod
+    def get_cdn_url(self, path): pass
+
+    @abstractmethod
+    def exists(self, path): pass
+
+    @abstractmethod
+    def uploadf(self, file, key, **kwargs): pass
+
+    def upload(self, filename: str, key: str, **kwargs):
+        with open(filename, 'rb') as f:
+            self.uploadf(f, key, **kwargs)
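The new Storage base class keeps upload() as a small template method over the abstract uploadf(), so an alternative backend only has to implement the four abstract methods. A hypothetical example, not part of the commit, assuming the storages package layout introduced here is importable:

# hypothetical sketch: a local-filesystem backend built on the same ABC
import os
import shutil

from storages import Storage


class LocalStorage(Storage):
    def __init__(self, config):
        # config is just a root directory here; the real S3Storage takes an S3Config dataclass
        self.root = config
        os.makedirs(self.root, exist_ok=True)

    def get_cdn_url(self, path):
        return 'file://' + os.path.join(os.path.abspath(self.root), path)

    def exists(self, path):
        return os.path.isfile(os.path.join(self.root, path))

    def uploadf(self, file, key, **kwargs):
        # Storage.upload() opens the file and hands the file object here,
        # mirroring how S3Storage passes it to upload_fileobj
        dest = os.path.join(self.root, key)
        os.makedirs(os.path.dirname(dest) or '.', exist_ok=True)
        with open(dest, 'wb') as out:
            shutil.copyfileobj(file, out)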
@@ -0,0 +1,49 @@
+import boto3
+from botocore.errorfactory import ClientError
+from .base_storage import Storage
+from dataclasses import dataclass
+
+
+@dataclass
+class S3Config:
+    bucket: str
+    region: str
+    key: str
+    secret: str
+    folder: str = ""
+
+
+class S3Storage(Storage):
+
+    def __init__(self, config: S3Config):
+        self.bucket = config.bucket
+        self.region = config.region
+        self.folder = config.folder
+
+        if len(self.folder) and self.folder[-1] != '/':
+            self.folder += '/'
+
+        self.s3 = boto3.client(
+            's3',
+            region_name=self.region,
+            endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
+            aws_access_key_id=config.key,
+            aws_secret_access_key=config.secret
+        )
+
+    def _get_path(self, key):
+        return self.folder + key
+
+    def get_cdn_url(self, key):
+        return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
+
+    def exists(self, key):
+        try:
+            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+            return True
+        except ClientError:
+            return False
+
+    def uploadf(self, file, key, **kwargs):
+        extra_args = kwargs["extra_args"] if "extra_args" in kwargs else {'ACL': 'public-read'}
+        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
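Taken together, the wiring in process_sheet reduces to roughly the following. A hedged sketch assuming the same environment variables the commit reads (DO_BUCKET, DO_SPACES_REGION, DO_SPACES_KEY, DO_SPACES_SECRET); the folder value and the URL are placeholders:

# sketch of the new wiring, mirroring process_sheet in this commit
import os
from dotenv import load_dotenv

import archivers
from storages import S3Storage, S3Config

load_dotenv()

s3_config = S3Config(
    bucket=os.getenv('DO_BUCKET'),
    region=os.getenv('DO_SPACES_REGION'),
    key=os.getenv('DO_SPACES_KEY'),
    secret=os.getenv('DO_SPACES_SECRET'),
    folder='my-sheet/my-worksheet/',  # process_sheet derives this per worksheet tab
)
storage = S3Storage(s3_config)

# every archiver now receives the Storage instance instead of a raw boto3 client
archiver = archivers.WaybackArchiver(storage)
result = archiver.download('https://example.com/some-page', check_if_exists=True)
if result:
    print(result.status, result.cdn_url)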