refactoring storage and bringing changes from origin

pull/13/head
msramalho 2022-02-22 16:03:35 +01:00
parent f3ce226665
commit e4603a9423
10 changed files with 197 additions and 137 deletions

__init__.py 100644 (1 addition)
View file

@@ -0,0 +1 @@
+from storages import *

View file

@@ -1,17 +1,10 @@
 import os
 import ffmpeg
-from dataclasses import dataclass
 import datetime
-from loguru import logger
+from dataclasses import dataclass
+from abc import ABC, abstractmethod

-# TODO There should be a better way of generating keys, that adds the following info:
-# - name of sheet that it is being archived from
-# (this means we might archive the same media twice on different sheets, but that's OK I think)
-# - name of archiver/platform that the video comes from
-# This should make it easier to maintain and clean the archive later
-# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
-# cleaned up? Difficult is we don't know the filename until the archivers start working.
+from storages import Storage

 @dataclass
@@ -25,33 +18,27 @@ class ArchiveResult:
     timestamp: datetime.datetime = None

-class Archiver:
+class Archiver(ABC):
     name = "default"

-    def __init__(self, s3_client):
-        self.s3 = s3_client
+    def __init__(self, storage: Storage):
+        self.storage = storage

     def __str__(self):
         return self.__class__.__name__

-    def download(self, url, check_if_exists=False):
-        logger.error("method 'download' not implemented")
-
-    def get_cdn_url(self, key):
-        return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
-            os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
-
-    def do_s3_upload(self, f, key):
-        self.s3.upload_fileobj(f, Bucket=os.getenv(
-            'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
+    @abstractmethod
+    def download(self, url, check_if_exists=False): pass

     def get_key(self, filename):
-        print(f"key base implementation: {self.name}")
-        # TODO: refactor to be more manageable
-        key = filename.split('/')[1]
-        if 'unknown_video' in key:
-            key = key.replace('unknown_video', 'jpg')
-        return key
+        """
+        returns a key in the format "[archiverName]_[filename]" includes extension
+        """
+        tail = os.path.split(filename)[1]  # returns filename.ext from full path
+        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
+        if 'unknown_video' in _id:
+            _id = _id.replace('unknown_video', 'jpg')
+        return f'{self.name}_{_id}{extension}'

     def get_thumbnails(self, filename, duration=None):
         if not os.path.exists(filename.split('.')[0]):
@@ -80,10 +67,9 @@ class Archiver:
             thumbnail_filename = filename.split('.')[0] + '/' + fname
             key = filename.split('/')[1].split('.')[0] + '/' + fname

-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)

-            with open(thumbnail_filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(thumbnail_filename, key)

             cdn_urls.append(cdn_url)
             os.remove(thumbnail_filename)
@@ -107,9 +93,8 @@ class Archiver:
         thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'

-        self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
-            'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
+        self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})

-        thumb_index_cdn_url = self.get_cdn_url(thumb_index)
+        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)

         return (key_thumb, thumb_index_cdn_url)
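
For reference, a minimal sketch (not part of the commit) of what the reworked get_key now returns, assuming an archiver named "telegram" and an illustrative temp file path:

import os

def get_key(archiver_name, filename):
    # mirrors the new Archiver.get_key: "[archiverName]_[filename]", extension kept
    tail = os.path.split(filename)[1]        # 'tmp/abc123.mp4' -> 'abc123.mp4'
    _id, extension = os.path.splitext(tail)  # -> ('abc123', '.mp4')
    if 'unknown_video' in _id:
        _id = _id.replace('unknown_video', 'jpg')
    return f'{archiver_name}_{_id}{extension}'

print(get_key('telegram', 'tmp/abc123.mp4'))  # -> telegram_abc123.mp4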

View file

@@ -1,15 +1,13 @@
 import os
 import requests
 from bs4 import BeautifulSoup
-from botocore.errorfactory import ClientError
-from .base_archiver import Archiver, ArchiveResult

-# TODO: get_cdn_url, get_thumbnails, do_s3_upload
+from .base_archiver import Archiver, ArchiveResult

 class TelegramArchiver(Archiver):
     name = "telegram"

     def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         if 'http://t.me/' not in url and 'https://t.me/' not in url:
@@ -35,19 +33,13 @@ class TelegramArchiver(Archiver):
         video_url = video.get('src')
         key = video_url.split('/')[-1].split('?')[0]
+        key = self.get_key(key)
         filename = 'tmp/' + key

-        if check_if_exists:
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-                status = 'already archived'
-            except ClientError:
-                pass
+        if check_if_exists and self.storage.exists(key):
+            status = 'already archived'
+            cdn_url = self.storage.get_cdn_url(key)

         v = requests.get(video_url, headers=headers)
@@ -55,10 +47,9 @@ class TelegramArchiver(Archiver):
             f.write(v.content)

         if status != 'already archived':
-            cdn_url = self.get_cdn_url(key)
-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            cdn_url = self.storage.get_cdn_url(key)
+            self.storage.upload(filename, key)

         # extract duration from HTML
         duration = s.find_all('time')[0].contents[0]
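
Stripped of the Telegram specifics, every archiver now follows roughly the same flow against the Storage interface. The following is an illustrative sketch only, not code from the commit: ExampleArchiver and the 'media.mp4' filename are made up, and it assumes the file lives alongside the other archivers.

from .base_archiver import Archiver, ArchiveResult

class ExampleArchiver(Archiver):  # hypothetical archiver, for illustration only
    name = "example"

    def download(self, url, check_if_exists=False):
        status = 'success'
        key = self.get_key('media.mp4')  # hypothetical filename
        filename = 'tmp/' + key

        # skip the download when the storage backend already has this key
        if check_if_exists and self.storage.exists(key):
            status = 'already archived'
        cdn_url = self.storage.get_cdn_url(key)

        if status != 'already archived':
            # ...fetch url into filename here...
            self.storage.upload(filename, key)

        return ArchiveResult(status=status, cdn_url=cdn_url)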

View file

@@ -1,15 +1,13 @@
 import os, traceback
-from botocore.errorfactory import ClientError
 import tiktok_downloader
 from loguru import logger
-from .base_archiver import Archiver, ArchiveResult

-# TODO: get_cdn_url, do_s3_upload, get_thumbnails
+from .base_archiver import Archiver, ArchiveResult

 class TiktokArchiver(Archiver):
     name = "tiktok"

     def download(self, url, check_if_exists=False):
         if 'tiktok.com' not in url:
             return False
@@ -18,35 +16,28 @@ class TiktokArchiver(Archiver):
         try:
             info = tiktok_downloader.info_post(url)
-            key = 'tiktok_' + str(info.id) + '.mp4'
-            cdn_url = self.get_cdn_url(key)
+            key = self.get_key(f'{info.id}.mp4')
             filename = 'tmp/' + key

-            if check_if_exists:
-                try:
-                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-                    # file exists
-                    cdn_url = self.get_cdn_url(key)
-                    status = 'already archived'
-                except ClientError:
-                    pass
+            if check_if_exists and self.storage.exists(key):
+                status = 'already archived'
+
+            media = tiktok_downloader.snaptik(url).get_media()
+
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
+                else:
+                    return ArchiveResult(status='Could not download media')
+
+            media[0].download(filename)

             if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        self.do_s3_upload(f, key)
-                    cdn_url = self.get_cdn_url(key)
-                else:
-                    status = 'could not download media'
+                self.storage.upload(filename, key)

             try:
-                key_thumb, thumb_index = self.get_thumbnails(
-                    filename, duration=info.duration)
+                key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
             except:
                 key_thumb = ''
                 thumb_index = 'error creating thumbnails'

View file

@@ -1,14 +1,15 @@
 import time, requests, os
 from bs4 import BeautifulSoup
+from storages import Storage

 from .base_archiver import Archiver, ArchiveResult

 class WaybackArchiver(Archiver):
     name = "wayback"

-    def __init__(self, s3_client):
-        self.s3 = s3_client
+    def __init__(self, storage: Storage):
+        super(WaybackArchiver, self).__init__(storage)
         self.seen_urls = {}

     def download(self, url, check_if_exists=False):
@@ -26,10 +27,12 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")

+        if 'job_id' not in r.json() and 'message' in r.json():
+            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
+
         job_id = r.json()['job_id']

-        status_r = requests.get(
-            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+        status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)

         retries = 0
@@ -51,7 +54,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()

         if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))

         archive_url = 'https://web.archive.org/web/' + \
             status_json['timestamp'] + '/' + status_json['original_url']
@@ -59,15 +62,15 @@ class WaybackArchiver(Archiver):
         try:
             r = requests.get(archive_url)
-            parsed = BeautifulSoup(
-                r.content, 'html.parser')
-            title = parsed.find_all('title')[
-                0].text
+            parsed = BeautifulSoup(r.content, 'html.parser')
+            title = parsed.find_all('title')[0].text
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
         except:
             title = "Could not get title"

-        result = ArchiveResult(
-            status='Internet Archive fallback', cdn_url=archive_url, title=title)
+        result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
         self.seen_urls[url] = result
         return result

View file

@@ -3,12 +3,13 @@ import os
 import datetime
 import youtube_dl
 from loguru import logger
-from botocore.errorfactory import ClientError

 from .base_archiver import Archiver, ArchiveResult

 class YoutubeDLArchiver(Archiver):
     name = "yotube_dl"

     def download(self, url, check_if_exists=False):
         ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
         if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
@@ -32,8 +33,11 @@ class YoutubeDLArchiver(Archiver):
         if check_if_exists:
             if 'entries' in info:
                 if len(info['entries']) > 1:
+                    logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                    return False
+                elif len(info['entries']) == 0:
                     logger.warning(
-                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                        'YoutubeDLArchiver succeeded but did not find video')
                     return False

                 filename = ydl.prepare_filename(info['entries'][0])
@@ -42,20 +46,14 @@ class YoutubeDLArchiver(Archiver):
             key = self.get_key(filename)

-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-                status = 'already archived'
-            except ClientError:
-                pass
+            if self.storage.exists(key):
+                status = 'already archived'
+                cdn_url = self.storage.get_cdn_url(key)

         # sometimes this results in a different filename, so do this again
         info = ydl.extract_info(url, download=True)

+        # TODO: add support for multiple videos
         if 'entries' in info:
             if len(info['entries']) > 1:
                 logger.warning(
@@ -70,19 +68,24 @@ class YoutubeDLArchiver(Archiver):
             filename = filename.split('.')[0] + '.mkv'

         if status != 'already archived':
-            key = self. get_key(filename)
-            cdn_url = self.get_cdn_url(key)
+            key = self.get_key(filename)
+            cdn_url = self.storage.get_cdn_url(key)

-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(filename, key)

         # get duration
         duration = info['duration'] if 'duration' in info else None

         # get thumbnails
-        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        try:
+            key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        except:
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'

         os.remove(filename)

+        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
+
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp)
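
The new one-line timestamp fallback is dense; an equivalent spelled-out helper reads as follows. This is a sketch for readability only, not part of the commit, and the name extract_timestamp is made up.

import datetime

def extract_timestamp(info: dict):
    # prefer the explicit 'timestamp' field youtube_dl provides
    if 'timestamp' in info:
        return info['timestamp']
    # otherwise derive it from 'upload_date' ('YYYYMMDD'), if present and not None
    if info.get('upload_date') is not None:
        return datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()
    return None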

View file

@@ -2,12 +2,13 @@ import os
 import datetime
 import argparse
 import math
+import requests
 import gspread
-import boto3
 from loguru import logger
 from dotenv import load_dotenv

 import archivers
+from storages import S3Storage, S3Config

 load_dotenv()
@@ -41,7 +42,7 @@ def index_to_col(index):
     return alphabet[index]

-def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
+def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
     update = []

     if columns['status'] is not None:
@@ -103,19 +104,24 @@ def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
 def process_sheet(sheet):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)
-    n_worksheets = len(sh.worksheets())

-    s3_client = boto3.client('s3',
-                             region_name=os.getenv('DO_SPACES_REGION'),
-                             endpoint_url='https://{}.digitaloceanspaces.com'.format(
-                                 os.getenv('DO_SPACES_REGION')),
-                             aws_access_key_id=os.getenv('DO_SPACES_KEY'),
-                             aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
+    s3_config = S3Config(
+        bucket=os.getenv('DO_BUCKET'),
+        region=os.getenv('DO_SPACES_REGION'),
+        key=os.getenv('DO_SPACES_KEY'),
+        secret=os.getenv('DO_SPACES_SECRET')
+    )
+    # s3_client = boto3.client('s3',
+    #                          region_name=os.getenv('DO_SPACES_REGION'),
+    #                          endpoint_url='https://{}.digitaloceanspaces.com'.format(
+    #                              os.getenv('DO_SPACES_REGION')),
+    #                          aws_access_key_id=os.getenv('DO_SPACES_KEY'),
+    #                          aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))

     # loop through worksheets to check
-    for ii in range(n_worksheets):
-        logger.info("Opening worksheet " + str(ii))
-        wks = sh.get_worksheet(ii)
+    for ii, wks in enumerate(sh.worksheets()):
+        logger.info(f'Opening worksheet {ii}: "{wks.title}"')
         values = wks.get_all_values()
         headers = [v.lower() for v in values[0]]
@@ -126,7 +132,7 @@ def process_sheet(sheet):
             'source url')) if 'source url' in headers else None

         if columns['url'] is None:
-            logger.warning("No 'Media URL' column found, skipping")
+            logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
             continue

         url_index = col_to_index(columns['url'])
@@ -153,6 +159,9 @@ def process_sheet(sheet):
         columns['duration'] = index_to_col(headers.index(
             'duration')) if 'duration' in headers else None

+        # archives will be in a folder 'doc_name/worksheet_name'
+        s3_config.folder = f'{sheet}/{wks.title}/'
+        s3_client = S3Storage(s3_config)

         # order matters, first to succeed excludes remaining
         active_archivers = [
@@ -162,37 +171,43 @@ def process_sheet(sheet):
             archivers.WaybackArchiver(s3_client)
         ]

         # loop through rows in worksheet
-        for i in range(2, len(values)+1):
-            v = values[i-1]
+        for i in range(2, len(values) + 1):
+            v = values[i - 1]
+            url = v[url_index]

-            if v[url_index] != "" and v[col_to_index(columns['status'])] == "":
-                latest_val = wks.acell(
-                    columns['status'] + str(i)).value
+            if url != "" and v[col_to_index(columns['status'])] == "":
+                latest_val = wks.acell(columns['status'] + str(i)).value

                 # check so we don't step on each others' toes
                 if latest_val == '' or latest_val is None:
-                    wks.update(
-                        columns['status'] + str(i), 'Archive in progress')
+                    wks.update(columns['status'] + str(i), 'Archive in progress')
+
+                    # expand short URL links
+                    if 'https://t.co/' in url:
+                        r = requests.get(url)
+                        url = r.url

                     for archiver in active_archivers:
                         logger.debug(f"Trying {archiver} on row {i}")
-                        result = archiver.download(v[url_index], check_if_exists=True)
+
+                        result = archiver.download(url, check_if_exists=True)
+
                         if result:
-                            logger.info(f"{archiver} succeeded on row {i}")
+                            logger.success(f"{archiver} succeeded on row {i}")
                             break

                     if result:
                         update_sheet(wks, i, result, columns, v)
+                    else:
+                        wks.update(columns['status'] + str(i), 'failed: no archiver')

                 # except:
                 #     if any unexpected errors occured, log these into the Google Sheet
                 #     t, value, traceback = sys.exc_info()
                 #     update_sheet(wks, i, str(
                 #         value), {}, columns, v)

 def main():

View file

@@ -0,0 +1,3 @@
+# we need to explicitly expose the available imports here
+from .base_storage import *
+from .s3_storage import *

View file

@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+
+
+class Storage(ABC):
+    @abstractmethod
+    def __init__(self, config): pass
+
+    @abstractmethod
+    def get_cdn_url(self, path): pass
+
+    @abstractmethod
+    def exists(self, path): pass
+
+    @abstractmethod
+    def uploadf(self, file, key, **kwargs): pass
+
+    def upload(self, filename: str, key: str, **kwargs):
+        with open(filename, 'rb') as f:
+            self.uploadf(f, key, **kwargs)
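
To illustrate the abstraction, a hypothetical backend only needs to fill in the four abstract methods; everything else (including upload, which wraps uploadf) comes from the base class. LocalStorage below is not part of this commit and is purely a sketch, with config taken to be a base directory path.

import os
import shutil
from storages import Storage

class LocalStorage(Storage):
    """Hypothetical storage backend that writes to a local folder instead of S3."""
    def __init__(self, config):
        self.root = config  # in this sketch, config is just a directory path
        os.makedirs(self.root, exist_ok=True)

    def get_cdn_url(self, path):
        return os.path.abspath(os.path.join(self.root, path))

    def exists(self, path):
        return os.path.isfile(os.path.join(self.root, path))

    def uploadf(self, file, key, **kwargs):
        # copy the already-open file object into the target location
        with open(os.path.join(self.root, key), 'wb') as out:
            shutil.copyfileobj(file, out)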

View file

@@ -0,0 +1,49 @@
+import boto3
+from botocore.errorfactory import ClientError
+from .base_storage import Storage
+from dataclasses import dataclass
+
+
+@dataclass
+class S3Config:
+    bucket: str
+    region: str
+    key: str
+    secret: str
+    folder: str = ""
+
+
+class S3Storage(Storage):
+    def __init__(self, config: S3Config):
+        self.bucket = config.bucket
+        self.region = config.region
+        self.folder = config.folder
+        if len(self.folder) and self.folder[-1] != '/':
+            self.folder += '/'
+
+        self.s3 = boto3.client(
+            's3',
+            region_name=self.region,
+            endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
+            aws_access_key_id=config.key,
+            aws_secret_access_key=config.secret
+        )
+
+    def _get_path(self, key):
+        return self.folder + key
+
+    def get_cdn_url(self, key):
+        return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
+
+    def exists(self, key):
+        try:
+            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+            return True
+        except ClientError:
+            return False
+
+    def uploadf(self, file, key, **kwargs):
+        extra_args = kwargs["extra_args"] if "extra_args" in kwargs else {'ACL': 'public-read'}
+        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
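
Usage follows the wiring shown in auto_archive.py above: credentials come from environment variables and the per-worksheet folder is set before the archivers run. The key and folder names below are illustrative, not values from the commit.

import os
from storages import S3Config, S3Storage

s3_config = S3Config(
    bucket=os.getenv('DO_BUCKET'),
    region=os.getenv('DO_SPACES_REGION'),
    key=os.getenv('DO_SPACES_KEY'),
    secret=os.getenv('DO_SPACES_SECRET'),
)
s3_config.folder = 'my doc/my worksheet/'  # archives land under 'doc_name/worksheet_name'
storage = S3Storage(s3_config)

# upload only if the object is not already there, then resolve its public URL
if not storage.exists('telegram_abc123.mp4'):
    storage.upload('tmp/telegram_abc123.mp4', 'telegram_abc123.mp4')
print(storage.get_cdn_url('telegram_abc123.mp4'))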