refactoring storage and bringing in changes from origin

pull/13/head
msramalho 2022-02-22 16:03:35 +01:00
parent f3ce226665
commit e4603a9423
10 changed files with 197 additions and 137 deletions

__init__.py 100644
View file

@@ -0,0 +1 @@
from storages import *

View file

@@ -1,17 +1,10 @@
import os
import ffmpeg
from dataclasses import dataclass
import datetime
from loguru import logger
from dataclasses import dataclass
from abc import ABC, abstractmethod
# TODO There should be a better way of generating keys, that adds the following info:
# - name of sheet that it is being archived from
# (this means we might archive the same media twice on different sheets, but that's OK I think)
# - name of archiver/platform that the video comes from
# This should make it easier to maintain and clean the archive later
# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
# cleaned up? Difficult is we don't know the filename until the archivers start working.
from storages import Storage
@dataclass
@@ -25,33 +18,27 @@ class ArchiveResult:
timestamp: datetime.datetime = None
class Archiver:
class Archiver(ABC):
name = "default"
def __init__(self, s3_client):
self.s3 = s3_client
def __init__(self, storage: Storage):
self.storage = storage
def __str__(self):
return self.__class__.__name__
def download(self, url, check_if_exists=False):
logger.error("method 'download' not implemented")
def get_cdn_url(self, key):
return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
def do_s3_upload(self, f, key):
self.s3.upload_fileobj(f, Bucket=os.getenv(
'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
@abstractmethod
def download(self, url, check_if_exists=False): pass
def get_key(self, filename):
print(f"key base implementation: {self.name}")
# TODO: refactor to be more manageable
key = filename.split('/')[1]
if 'unknown_video' in key:
key = key.replace('unknown_video', 'jpg')
return key
"""
returns a key in the format "[archiverName]_[filename]", including the extension
"""
tail = os.path.split(filename)[1] # returns filename.ext from full path
_id, extension = os.path.splitext(tail) # returns [filename, .ext]
if 'unknown_video' in _id:
_id = _id.replace('unknown_video', 'jpg')
return f'{self.name}_{_id}{extension}'
def get_thumbnails(self, filename, duration=None):
if not os.path.exists(filename.split('.')[0]):
@@ -80,10 +67,9 @@ class Archiver:
thumbnail_filename = filename.split('.')[0] + '/' + fname
key = filename.split('/')[1].split('.')[0] + '/' + fname
cdn_url = self.get_cdn_url(key)
cdn_url = self.storage.get_cdn_url(key)
with open(thumbnail_filename, 'rb') as f:
self.do_s3_upload(f, key)
self.storage.upload(thumbnail_filename, key)
cdn_urls.append(cdn_url)
os.remove(thumbnail_filename)
@@ -107,9 +93,8 @@ class Archiver:
thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
thumb_index_cdn_url = self.get_cdn_url(thumb_index)
thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
return (key_thumb, thumb_index_cdn_url)
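A hypothetical minimal archiver (not part of this commit; the name, URL handling and filenames are placeholders) illustrating the effect of this refactor: subclasses now receive a Storage instance instead of a raw boto3 client, and go through self.storage for existence checks, uploads and CDN URLs.

import requests
from .base_archiver import Archiver, ArchiveResult

class ExampleArchiver(Archiver):
    name = "example"  # get_key prefixes keys with this, e.g. "example_some_id.mp4"

    def download(self, url, check_if_exists=False):
        key = self.get_key('some_id.mp4')  # illustrative filename
        if check_if_exists and self.storage.exists(key):
            # file is already in the bucket, just report its CDN URL
            return ArchiveResult(status='already archived', cdn_url=self.storage.get_cdn_url(key))
        filename = 'tmp/' + key
        with open(filename, 'wb') as f:
            f.write(requests.get(url).content)
        self.storage.upload(filename, key)
        return ArchiveResult(status='success', cdn_url=self.storage.get_cdn_url(key))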

View file

@@ -1,15 +1,13 @@
import os
import requests
from bs4 import BeautifulSoup
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, get_thumbnails, do_s3_upload
from .base_archiver import Archiver, ArchiveResult
class TelegramArchiver(Archiver):
name = "telegram"
def download(self, url, check_if_exists=False):
# detect URLs that we definitely cannot handle
if 'http://t.me/' not in url and 'https://t.me/' not in url:
@@ -35,19 +33,13 @@ class TelegramArchiver(Archiver):
video_url = video.get('src')
key = video_url.split('/')[-1].split('?')[0]
key = self.get_key(key)
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
status = 'already archived'
except ClientError:
pass
if check_if_exists and self.storage.exists(key):
status = 'already archived'
cdn_url = self.storage.get_cdn_url(key)
v = requests.get(video_url, headers=headers)
@@ -55,10 +47,9 @@ class TelegramArchiver(Archiver):
f.write(v.content)
if status != 'already archived':
cdn_url = self.get_cdn_url(key)
cdn_url = self.storage.get_cdn_url(key)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
self.storage.upload(filename, key)
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]

View file

@@ -1,15 +1,13 @@
import os, traceback
from botocore.errorfactory import ClientError
import tiktok_downloader
from loguru import logger
from .base_archiver import Archiver, ArchiveResult
# TODO: get_cdn_url, do_s3_upload, get_thumbnails
from .base_archiver import Archiver, ArchiveResult
class TiktokArchiver(Archiver):
name = "tiktok"
def download(self, url, check_if_exists=False):
if 'tiktok.com' not in url:
return False
@@ -18,35 +16,28 @@ class TiktokArchiver(Archiver):
try:
info = tiktok_downloader.info_post(url)
key = 'tiktok_' + str(info.id) + '.mp4'
key = self.get_key(f'{info.id}.mp4')
cdn_url = self.get_cdn_url(key)
filename = 'tmp/' + key
if check_if_exists:
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
if check_if_exists and self.storage.exists(key):
status = 'already archived'
# file exists
cdn_url = self.get_cdn_url(key)
media = tiktok_downloader.snaptik(url).get_media()
status = 'already archived'
if len(media) <= 0:
if status == 'already archived':
return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
else:
return ArchiveResult(status='Could not download media')
except ClientError:
pass
media[0].download(filename)
if status != 'already archived':
media = tiktok_downloader.snaptik(url).get_media()
if len(media) > 0:
media[0].download(filename)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
cdn_url = self.get_cdn_url(key)
else:
status = 'could not download media'
self.storage.upload(filename, key)
try:
key_thumb, thumb_index = self.get_thumbnails(
filename, duration=info.duration)
key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
except:
key_thumb = ''
thumb_index = 'error creating thumbnails'

View file

@@ -1,14 +1,15 @@
import time, requests, os
from bs4 import BeautifulSoup
from storages import Storage
from .base_archiver import Archiver, ArchiveResult
class WaybackArchiver(Archiver):
name = "wayback"
def __init__(self, s3_client):
self.s3 = s3_client
def __init__(self, storage: Storage):
super(WaybackArchiver, self).__init__(storage)
self.seen_urls = {}
def download(self, url, check_if_exists=False):
@@ -26,10 +27,12 @@ class WaybackArchiver(Archiver):
if r.status_code != 200:
return ArchiveResult(status="Internet archive failed")
if 'job_id' not in r.json() and 'message' in r.json():
return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
job_id = r.json()['job_id']
status_r = requests.get(
'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)
retries = 0
@@ -51,7 +54,7 @@ class WaybackArchiver(Archiver):
status_json = status_r.json()
if status_json['status'] != 'success':
return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
archive_url = 'https://web.archive.org/web/' + \
status_json['timestamp'] + '/' + status_json['original_url']
@@ -59,15 +62,15 @@ class WaybackArchiver(Archiver):
try:
r = requests.get(archive_url)
parsed = BeautifulSoup(
r.content, 'html.parser')
parsed = BeautifulSoup(r.content, 'html.parser')
title = parsed.find_all('title')[
0].text
title = parsed.find_all('title')[0].text
if title == 'Wayback Machine':
title = 'Could not get title'
except:
title = "Could not get title"
result = ArchiveResult(
status='Internet Archive fallback', cdn_url=archive_url, title=title)
result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
self.seen_urls[url] = result
return result

View file

@@ -3,12 +3,13 @@ import os
import datetime
import youtube_dl
from loguru import logger
from botocore.errorfactory import ClientError
from .base_archiver import Archiver, ArchiveResult
class YoutubeDLArchiver(Archiver):
name = "yotube_dl"
def download(self, url, check_if_exists=False):
ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://www.facebook.com/') and os.getenv('FB_COOKIE'):
@@ -32,8 +33,11 @@ class YoutubeDLArchiver(Archiver):
if check_if_exists:
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
return False
elif len(info['entries']) == 0:
logger.warning(
'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
'YoutubeDLArchiver succeeded but did not find video')
return False
filename = ydl.prepare_filename(info['entries'][0])
@@ -42,20 +46,14 @@ class YoutubeDLArchiver(Archiver):
key = self.get_key(filename)
try:
self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
# file exists
cdn_url = self.get_cdn_url(key)
if self.storage.exists(key):
status = 'already archived'
except ClientError:
pass
cdn_url = self.storage.get_cdn_url(key)
# sometimes this results in a different filename, so do this again
info = ydl.extract_info(url, download=True)
# TODO: add support for multiple videos
if 'entries' in info:
if len(info['entries']) > 1:
logger.warning(
@@ -70,19 +68,24 @@ class YoutubeDLArchiver(Archiver):
filename = filename.split('.')[0] + '.mkv'
if status != 'already archived':
key = self. get_key(filename)
cdn_url = self.get_cdn_url(key)
key = self.get_key(filename)
cdn_url = self.storage.get_cdn_url(key)
with open(filename, 'rb') as f:
self.do_s3_upload(f, key)
self.storage.upload(filename, key)
# get duration
duration = info['duration'] if 'duration' in info else None
# get thumbnails
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
try:
key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
except:
key_thumb = ''
thumb_index = 'Could not generate thumbnails'
os.remove(filename)
timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
title=info['title'] if 'title' in info else None,
timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
title=info['title'] if 'title' in info else None, timestamp=timestamp)
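The nested conditional expression that computes timestamp above is dense; an equivalent expanded form (purely illustrative, same logic as the one-liner in the diff; datetime is already imported at the top of the module):

if 'timestamp' in info:
    timestamp = info['timestamp']
elif 'upload_date' in info and info['upload_date'] is not None:
    timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()
else:
    timestamp = None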

View file

@@ -2,12 +2,13 @@ import os
import datetime
import argparse
import math
import requests
import gspread
import boto3
from loguru import logger
from dotenv import load_dotenv
import archivers
from storages import S3Storage, S3Config
load_dotenv()
@@ -41,7 +42,7 @@ def index_to_col(index):
return alphabet[index]
def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
update = []
if columns['status'] is not None:
@@ -103,19 +104,24 @@ def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
def process_sheet(sheet):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
n_worksheets = len(sh.worksheets())
s3_client = boto3.client('s3',
region_name=os.getenv('DO_SPACES_REGION'),
endpoint_url='https://{}.digitaloceanspaces.com'.format(
os.getenv('DO_SPACES_REGION')),
aws_access_key_id=os.getenv('DO_SPACES_KEY'),
aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
s3_config = S3Config(
bucket=os.getenv('DO_BUCKET'),
region=os.getenv('DO_SPACES_REGION'),
key=os.getenv('DO_SPACES_KEY'),
secret=os.getenv('DO_SPACES_SECRET')
)
# s3_client = boto3.client('s3',
# region_name=os.getenv('DO_SPACES_REGION'),
# endpoint_url='https://{}.digitaloceanspaces.com'.format(
# os.getenv('DO_SPACES_REGION')),
# aws_access_key_id=os.getenv('DO_SPACES_KEY'),
# aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
# loop through worksheets to check
for ii in range(n_worksheets):
logger.info("Opening worksheet " + str(ii))
wks = sh.get_worksheet(ii)
for ii, wks in enumerate(sh.worksheets()):
logger.info(f'Opening worksheet {ii}: "{wks.title}"')
values = wks.get_all_values()
headers = [v.lower() for v in values[0]]
@@ -126,7 +132,7 @@ def process_sheet(sheet):
'source url')) if 'source url' in headers else None
if columns['url'] is None:
logger.warning("No 'Media URL' column found, skipping")
logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
continue
url_index = col_to_index(columns['url'])
@@ -153,6 +159,9 @@ def process_sheet(sheet):
columns['duration'] = index_to_col(headers.index(
'duration')) if 'duration' in headers else None
# archives will be in a folder 'doc_name/worksheet_name'
s3_config.folder = f'{sheet}/{wks.title}/'
s3_client = S3Storage(s3_config)
# order matters, first to succeed excludes remaining
active_archivers = [
@@ -162,37 +171,43 @@ def process_sheet(sheet):
archivers.WaybackArchiver(s3_client)
]
# loop through rows in worksheet
for i in range(2, len(values)+1):
v = values[i-1]
for i in range(2, len(values) + 1):
v = values[i - 1]
url = v[url_index]
if v[url_index] != "" and v[col_to_index(columns['status'])] == "":
latest_val = wks.acell(
columns['status'] + str(i)).value
if url != "" and v[col_to_index(columns['status'])] == "":
latest_val = wks.acell(columns['status'] + str(i)).value
# check so we don't step on each others' toes
if latest_val == '' or latest_val is None:
wks.update(
columns['status'] + str(i), 'Archive in progress')
wks.update(columns['status'] + str(i), 'Archive in progress')
# expand short URL links
if 'https://t.co/' in url:
r = requests.get(url)
url = r.url
for archiver in active_archivers:
logger.debug(f"Trying {archiver} on row {i}")
result = archiver.download(v[url_index], check_if_exists=True)
result = archiver.download(url, check_if_exists=True)
if result:
logger.info(f"{archiver} succeeded on row {i}")
logger.success(f"{archiver} succeeded on row {i}")
break
if result:
update_sheet(wks, i, result, columns, v)
else:
wks.update(columns['status'] + str(i), 'failed: no archiver')
# except:
# if any unexpected errors occurred, log these into the Google Sheet
# t, value, traceback = sys.exc_info()
# except:
# if any unexpected errors occurred, log these into the Google Sheet
# t, value, traceback = sys.exc_info()
# update_sheet(wks, i, str(
# value), {}, columns, v)
# update_sheet(wks, i, str(
# value), {}, columns, v)
def main():

View file

@@ -0,0 +1,3 @@
# we need to explicitly expose the available imports here
from .base_storage import *
from .s3_storage import *

View file

@@ -0,0 +1,19 @@
from abc import ABC, abstractmethod
class Storage(ABC):
@abstractmethod
def __init__(self, config): pass
@abstractmethod
def get_cdn_url(self, path): pass
@abstractmethod
def exists(self, path): pass
@abstractmethod
def uploadf(self, file, key, **kwargs): pass
def upload(self, filename: str, key: str, **kwargs):
with open(filename, 'rb') as f:
self.uploadf(f, key, **kwargs)
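As a hypothetical illustration (not in this commit) of how another backend could plug into this abstraction, a local-filesystem storage might look roughly like the sketch below; the class name and the assumption that config is a plain directory path are made up for the example.

import os
import shutil
from .base_storage import Storage

class LocalStorage(Storage):
    def __init__(self, config):
        self.root = config  # assumed: a base directory path

    def get_cdn_url(self, key):
        return os.path.join(self.root, key)  # no CDN locally, return the path itself

    def exists(self, key):
        return os.path.exists(os.path.join(self.root, key))

    def uploadf(self, file, key, **kwargs):
        dest = os.path.join(self.root, key)
        os.makedirs(os.path.dirname(dest) or '.', exist_ok=True)
        with open(dest, 'wb') as out:
            shutil.copyfileobj(file, out)  # upload() from the base class keeps working unchanged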

View file

@@ -0,0 +1,49 @@
import boto3
from botocore.errorfactory import ClientError
from .base_storage import Storage
from dataclasses import dataclass
@dataclass
class S3Config:
bucket: str
region: str
key: str
secret: str
folder: str = ""
class S3Storage(Storage):
def __init__(self, config: S3Config):
self.bucket = config.bucket
self.region = config.region
self.folder = config.folder
if len(self.folder) and self.folder[-1] != '/':
self.folder += '/'
self.s3 = boto3.client(
's3',
region_name=self.region,
endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
aws_access_key_id=config.key,
aws_secret_access_key=config.secret
)
def _get_path(self, key):
return self.folder + key
def get_cdn_url(self, key):
return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
def exists(self, key):
try:
self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
return True
except ClientError:
return False
def uploadf(self, file, key, **kwargs):
extra_args = kwargs["extra_args"] if "extra_args" in kwargs else {'ACL': 'public-read'}
self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
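End-to-end usage sketch mirroring how auto_archive.py wires this up in the diff above; the environment variable names come from the diff, while the folder, key and local path are illustrative.

import os
from storages import S3Storage, S3Config

s3_config = S3Config(
    bucket=os.getenv('DO_BUCKET'),
    region=os.getenv('DO_SPACES_REGION'),
    key=os.getenv('DO_SPACES_KEY'),
    secret=os.getenv('DO_SPACES_SECRET'),
    folder='my_sheet/my_worksheet/',  # archives end up under "doc_name/worksheet_name"
)
storage = S3Storage(s3_config)

key = 'telegram_abc123.mp4'  # illustrative key, as produced by Archiver.get_key
if not storage.exists(key):
    storage.upload('tmp/' + key, key)  # opens the local file and delegates to uploadf
print(storage.get_cdn_url(key))
# e.g. https://<bucket>.<region>.cdn.digitaloceanspaces.com/my_sheet/my_worksheet/telegram_abc123.mp4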