From e4603a942305bcd9ad772d14e21af7cb896ba759 Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Tue, 22 Feb 2022 16:03:35 +0100
Subject: [PATCH] refactoring storage and bringing changes from origin

---
 __init__.py                     |  1 +
 archivers/base_archiver.py      | 55 +++++++++----------------
 archivers/telegram_archiver.py  | 27 ++++--------
 archivers/tiktok_archiver.py    | 39 +++++++-----------
 archivers/wayback_archiver.py   | 27 ++++++------
 archivers/youtubedl_archiver.py | 41 +++++++++---------
 auto_archive.py                 | 73 ++++++++++++++++++++-------------
 storages/__init__.py            |  3 ++
 storages/base_storage.py        | 19 +++++++++
 storages/s3_storage.py          | 49 ++++++++++++++++++++++
 10 files changed, 197 insertions(+), 137 deletions(-)
 create mode 100644 __init__.py
 create mode 100644 storages/__init__.py
 create mode 100644 storages/base_storage.py
 create mode 100644 storages/s3_storage.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..b85e02a
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+from storages import *
\ No newline at end of file
diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index 3f9f4ac..b13a77f 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -1,17 +1,10 @@
 import os
 import ffmpeg
-from dataclasses import dataclass
 import datetime
-from loguru import logger
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
 
-# TODO There should be a better way of generating keys, that adds the following info:
-# - name of sheet that it is being archived from
-#   (this means we might archive the same media twice on different sheets, but that's OK I think)
-# - name of archiver/platform that the video comes from
-# This should make it easier to maintain and clean the archive later
-
-# TODO "check_if_exists" has lots of repeated code across the archivers. Can this be
-# cleaned up? Difficult is we don't know the filename until the archivers start working.
+from storages import Storage
 
 
 @dataclass
@@ -25,33 +18,27 @@ class ArchiveResult:
     timestamp: datetime.datetime = None
 
 
-class Archiver:
+class Archiver(ABC):
     name = "default"
 
-    def __init__(self, s3_client):
-        self.s3 = s3_client
+    def __init__(self, storage: Storage):
+        self.storage = storage
 
     def __str__(self):
         return self.__class__.__name__
 
-    def download(self, url, check_if_exists=False):
-        logger.error("method 'download' not implemented")
-
-    def get_cdn_url(self, key):
-        return 'https://{}.{}.cdn.digitaloceanspaces.com/{}'.format(
-            os.getenv('DO_BUCKET'), os.getenv('DO_SPACES_REGION'), key)
-
-    def do_s3_upload(self, f, key):
-        self.s3.upload_fileobj(f, Bucket=os.getenv(
-            'DO_BUCKET'), Key=key, ExtraArgs={'ACL': 'public-read'})
+    @abstractmethod
+    def download(self, url, check_if_exists=False): pass
 
     def get_key(self, filename):
-        print(f"key base implementation: {self.name}")
-        # TODO: refactor to be more manageable
-        key = filename.split('/')[1]
-        if 'unknown_video' in key:
-            key = key.replace('unknown_video', 'jpg')
-        return key
+        """
+        returns a key in the format "[archiverName]_[filename]", extension included
+        """
+        tail = os.path.split(filename)[1]  # returns filename.ext from full path
+        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
+        if 'unknown_video' in _id:
+            _id = _id.replace('unknown_video', 'jpg')
+        return f'{self.name}_{_id}{extension}'
 
     def get_thumbnails(self, filename, duration=None):
         if not os.path.exists(filename.split('.')[0]):
@@ -80,10 +67,9 @@ class Archiver:
             thumbnail_filename = filename.split('.')[0] + '/' + fname
             key = filename.split('/')[1].split('.')[0] + '/' + fname
 
-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)
 
-            with open(thumbnail_filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(thumbnail_filename, key)
 
             cdn_urls.append(cdn_url)
             os.remove(thumbnail_filename)
@@ -107,9 +93,8 @@ class Archiver:
 
         thumb_index = filename.split('/')[1].split('.')[0] + '/index.html'
 
-        self.s3.upload_fileobj(open(index_fname, 'rb'), Bucket=os.getenv(
-            'DO_BUCKET'), Key=thumb_index, ExtraArgs={'ACL': 'public-read', 'ContentType': 'text/html'})
+        self.storage.upload(index_fname, thumb_index, extra_args={'ACL': 'public-read', 'ContentType': 'text/html'})
 
-        thumb_index_cdn_url = self.get_cdn_url(thumb_index)
+        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
 
         return (key_thumb, thumb_index_cdn_url)
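
For reference, the naming scheme introduced by the new Archiver.get_key can be reproduced standalone, which makes its output easy to check. A minimal sketch (illustration only, not part of the patch; the example paths and names are made up):

    import os

    def get_key(archiver_name, filename):
        # mirrors Archiver.get_key: "[archiverName]_[filename]", extension kept
        tail = os.path.split(filename)[1]        # 'tmp/abc123.mp4' -> 'abc123.mp4'
        _id, extension = os.path.splitext(tail)  # -> ('abc123', '.mp4')
        if 'unknown_video' in _id:
            _id = _id.replace('unknown_video', 'jpg')
        return f'{archiver_name}_{_id}{extension}'

    print(get_key('telegram', 'tmp/abc123.mp4'))  # -> telegram_abc123.mp4
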
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index d9168e4..16c6ccf 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -1,15 +1,13 @@
 import os
 import requests
 from bs4 import BeautifulSoup
-from botocore.errorfactory import ClientError
-from .base_archiver import Archiver, ArchiveResult
-# TODO: get_cdn_url, get_thumbnails, do_s3_upload
+from .base_archiver import Archiver, ArchiveResult
 
 
 class TelegramArchiver(Archiver):
     name = "telegram"
-
+
     def download(self, url, check_if_exists=False):
         # detect URLs that we definitely cannot handle
         if 'http://t.me/' not in url and 'https://t.me/' not in url:
@@ -35,19 +33,13 @@ class TelegramArchiver(Archiver):
 
         video_url = video.get('src')
         key = video_url.split('/')[-1].split('?')[0]
+        key = self.get_key(key)
+
         filename = 'tmp/' + key
 
-        if check_if_exists:
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-
-                status = 'already archived'
-
-            except ClientError:
-                pass
+        if check_if_exists and self.storage.exists(key):
+            status = 'already archived'
+            cdn_url = self.storage.get_cdn_url(key)
 
         v = requests.get(video_url, headers=headers)
 
@@ -55,10 +47,9 @@ class TelegramArchiver(Archiver):
             f.write(v.content)
 
         if status != 'already archived':
-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)
 
-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(filename, key)
 
         # extract duration from HTML
         duration = s.find_all('time')[0].contents[0]
diff --git a/archivers/tiktok_archiver.py b/archivers/tiktok_archiver.py
index 1e3bcaf..e61fec9 100644
--- a/archivers/tiktok_archiver.py
+++ b/archivers/tiktok_archiver.py
@@ -1,15 +1,13 @@
 import os, traceback
-from botocore.errorfactory import ClientError
 import tiktok_downloader
 from loguru import logger
-from .base_archiver import Archiver, ArchiveResult
-# TODO: get_cdn_url, do_s3_upload, get_thumbnails
+from .base_archiver import Archiver, ArchiveResult
 
 
 class TiktokArchiver(Archiver):
     name = "tiktok"
-
+
     def download(self, url, check_if_exists=False):
         if 'tiktok.com' not in url:
             return False
@@ -18,35 +16,28 @@
 
         try:
             info = tiktok_downloader.info_post(url)
-            key = 'tiktok_' + str(info.id) + '.mp4'
+            key = self.get_key(f'{info.id}.mp4')
+            cdn_url = self.storage.get_cdn_url(key)
             filename = 'tmp/' + key
 
-            if check_if_exists:
-                try:
-                    self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
+            if check_if_exists and self.storage.exists(key):
+                status = 'already archived'
 
-                    # file exists
-                    cdn_url = self.get_cdn_url(key)
+            media = tiktok_downloader.snaptik(url).get_media()
 
-                    status = 'already archived'
+            if len(media) <= 0:
+                if status == 'already archived':
+                    return ArchiveResult(status='Could not download media, but already archived', cdn_url=cdn_url)
+                else:
+                    return ArchiveResult(status='Could not download media')
 
-                except ClientError:
-                    pass
+            media[0].download(filename)
 
             if status != 'already archived':
-                media = tiktok_downloader.snaptik(url).get_media()
-                if len(media) > 0:
-                    media[0].download(filename)
-                    with open(filename, 'rb') as f:
-                        self.do_s3_upload(f, key)
-
-                    cdn_url = self.get_cdn_url(key)
-                else:
-                    status = 'could not download media'
+                self.storage.upload(filename, key)
 
             try:
-                key_thumb, thumb_index = self.get_thumbnails(
-                    filename, duration=info.duration)
+                key_thumb, thumb_index = self.get_thumbnails(filename, duration=info.duration)
             except:
                 key_thumb = ''
                 thumb_index = 'error creating thumbnails'
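
With the repeated head_object/ClientError boilerplate gone, the archivers above now talk to storage only through exists(), get_cdn_url() and upload(), which also makes the download flow easy to exercise against a stub. A minimal sketch, assuming the Storage ABC added in storages/base_storage.py below (FakeStorage is a hypothetical test helper, not part of the patch):

    from storages import Storage

    class FakeStorage(Storage):
        # in-memory stand-in for S3Storage, handy when exercising archivers in tests
        def __init__(self, config=None):
            self.files = {}

        def get_cdn_url(self, path):
            return f'fake://cdn/{path}'

        def exists(self, path):
            return path in self.files

        def uploadf(self, file, key, **kwargs):
            self.files[key] = file.read()
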
diff --git a/archivers/wayback_archiver.py b/archivers/wayback_archiver.py
index a021324..53b356f 100644
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@@ -1,14 +1,15 @@
 import time, requests, os
 from bs4 import BeautifulSoup
 
+from storages import Storage
 from .base_archiver import Archiver, ArchiveResult
 
 
 class WaybackArchiver(Archiver):
     name = "wayback"
-
-    def __init__(self, s3_client):
-        self.s3 = s3_client
+
+    def __init__(self, storage: Storage):
+        super(WaybackArchiver, self).__init__(storage)
         self.seen_urls = {}
 
     def download(self, url, check_if_exists=False):
@@ -26,10 +27,12 @@ class WaybackArchiver(Archiver):
         if r.status_code != 200:
             return ArchiveResult(status="Internet archive failed")
 
+        if 'job_id' not in r.json() and 'message' in r.json():
+            return ArchiveResult(status=f"Internet archive failed: {r.json()['message']}")
+
         job_id = r.json()['job_id']
 
-        status_r = requests.get(
-            'https://web.archive.org/save/status/' + job_id, headers=ia_headers)
+        status_r = requests.get('https://web.archive.org/save/status/' + job_id, headers=ia_headers)
 
         retries = 0
 
@@ -51,7 +54,7 @@ class WaybackArchiver(Archiver):
         status_json = status_r.json()
 
         if status_json['status'] != 'success':
-            return ArchiveResult(status='Internet Archive failed: ' + status_json['message'])
+            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
 
         archive_url = 'https://web.archive.org/web/' + \
             status_json['timestamp'] + '/' + status_json['original_url']
@@ -59,15 +62,15 @@ class WaybackArchiver(Archiver):
 
         try:
             r = requests.get(archive_url)
 
-            parsed = BeautifulSoup(
-                r.content, 'html.parser')
+            parsed = BeautifulSoup(r.content, 'html.parser')
 
-            title = parsed.find_all('title')[
-                0].text
+            title = parsed.find_all('title')[0].text
+
+            if title == 'Wayback Machine':
+                title = 'Could not get title'
         except:
             title = "Could not get title"
 
-        result = ArchiveResult(
-            status='Internet Archive fallback', cdn_url=archive_url, title=title)
+        result = ArchiveResult(status='Internet Archive fallback', cdn_url=archive_url, title=title)
         self.seen_urls[url] = result
         return result
diff --git a/archivers/youtubedl_archiver.py b/archivers/youtubedl_archiver.py
index 8249cfa..88f7970 100644
--- a/archivers/youtubedl_archiver.py
+++ b/archivers/youtubedl_archiver.py
@@ -3,12 +3,13 @@ import os
 import datetime
 import youtube_dl
 from loguru import logger
-from botocore.errorfactory import ClientError
+
 from .base_archiver import Archiver, ArchiveResult
 
+
 class YoutubeDLArchiver(Archiver):
     name = "yotube_dl"
-
+
     def download(self, url, check_if_exists=False):
         ydl_opts = {'outtmpl': 'tmp/%(id)s.%(ext)s', 'quiet': False}
         if (url[0:21] == 'https://facebook.com/' or url[0:25] == 'https://wwww.facebook.com/') and os.getenv('FB_COOKIE'):
@@ -32,8 +33,11 @@ class YoutubeDLArchiver(Archiver):
         if check_if_exists:
             if 'entries' in info:
                 if len(info['entries']) > 1:
+                    logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
+                    return False
+                elif len(info['entries']) == 0:
                     logger.warning(
-                        'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
+                        'YoutubeDLArchiver succeeded but did not find video')
                     return False
 
                 filename = ydl.prepare_filename(info['entries'][0])
@@ -42,20 +46,14 @@ class YoutubeDLArchiver(Archiver):
 
             key = self.get_key(filename)
 
-            try:
-                self.s3.head_object(Bucket=os.getenv('DO_BUCKET'), Key=key)
-
-                # file exists
-                cdn_url = self.get_cdn_url(key)
-
+            if self.storage.exists(key):
                 status = 'already archived'
-
-            except ClientError:
-                pass
+                cdn_url = self.storage.get_cdn_url(key)
 
         # sometimes this results in a different filename, so do this again
         info = ydl.extract_info(url, download=True)
 
+        # TODO: add support for multiple videos
         if 'entries' in info:
             if len(info['entries']) > 1:
                 logger.warning(
@@ -70,19 +68,24 @@ class YoutubeDLArchiver(Archiver):
             filename = filename.split('.')[0] + '.mkv'
 
         if status != 'already archived':
-            key = self.get_key(filename)
-            cdn_url = self.get_cdn_url(key)
+            key = self.get_key(filename)
+            cdn_url = self.storage.get_cdn_url(key)
 
-            with open(filename, 'rb') as f:
-                self.do_s3_upload(f, key)
+            self.storage.upload(filename, key)
 
         # get duration
         duration = info['duration'] if 'duration' in info else None
 
         # get thumbnails
-        key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        try:
+            key_thumb, thumb_index = self.get_thumbnails(filename, duration=duration)
+        except:
+            key_thumb = ''
+            thumb_index = 'Could not generate thumbnails'
+
         os.remove(filename)
 
+        timestamp = info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info and info['upload_date'] is not None else None
+
         return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
-                             title=info['title'] if 'title' in info else None,
-                             timestamp=info['timestamp'] if 'timestamp' in info else datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp() if 'upload_date' in info else None)
+                             title=info['title'] if 'title' in info else None, timestamp=timestamp)
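
The timestamp expression introduced above packs three fallbacks into one line; unpacked, the logic reads as follows (equivalent sketch for readability, not part of the diff):

    import datetime

    def extract_timestamp(info):
        # 1) trust youtube_dl's explicit timestamp when present
        if 'timestamp' in info:
            return info['timestamp']
        # 2) otherwise derive one from upload_date ('YYYYMMDD'), if set
        if info.get('upload_date') is not None:
            return datetime.datetime.strptime(info['upload_date'], '%Y%m%d').timestamp()
        # 3) otherwise leave the timestamp empty
        return None
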
diff --git a/auto_archive.py b/auto_archive.py
index c478463..36bbadb 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -2,12 +2,13 @@ import os
 import datetime
 import argparse
 import math
+import requests
 import gspread
-import boto3
 from loguru import logger
 from dotenv import load_dotenv
 
 import archivers
+from storages import S3Storage, S3Config
 
 load_dotenv()
 
@@ -41,7 +42,7 @@ def index_to_col(index):
     return alphabet[index]
 
 
-def update_sheet(wks, row, result : archivers.ArchiveResult, columns, v):
+def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
     update = []
 
     if columns['status'] is not None:
@@ -103,19 +104,24 @@
 def process_sheet(sheet):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)
-    n_worksheets = len(sh.worksheets())
 
-    s3_client = boto3.client('s3',
-                             region_name=os.getenv('DO_SPACES_REGION'),
-                             endpoint_url='https://{}.digitaloceanspaces.com'.format(
-                                 os.getenv('DO_SPACES_REGION')),
-                             aws_access_key_id=os.getenv('DO_SPACES_KEY'),
-                             aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
+    s3_config = S3Config(
+        bucket=os.getenv('DO_BUCKET'),
+        region=os.getenv('DO_SPACES_REGION'),
+        key=os.getenv('DO_SPACES_KEY'),
+        secret=os.getenv('DO_SPACES_SECRET')
+    )
+
+    # s3_client = boto3.client('s3',
+    #                          region_name=os.getenv('DO_SPACES_REGION'),
+    #                          endpoint_url='https://{}.digitaloceanspaces.com'.format(
+    #                              os.getenv('DO_SPACES_REGION')),
+    #                          aws_access_key_id=os.getenv('DO_SPACES_KEY'),
+    #                          aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
 
     # loop through worksheets to check
-    for ii in range(n_worksheets):
-        logger.info("Opening worksheet " + str(ii))
-        wks = sh.get_worksheet(ii)
+    for ii, wks in enumerate(sh.worksheets()):
+        logger.info(f'Opening worksheet {ii}: "{wks.title}"')
         values = wks.get_all_values()
 
         headers = [v.lower() for v in values[0]]
@@ -126,7 +132,7 @@
             'source url')) if 'source url' in headers else None
 
         if columns['url'] is None:
-            logger.warning("No 'Media URL' column found, skipping")
+            logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
             continue
 
         url_index = col_to_index(columns['url'])
@@ -153,6 +159,9 @@
         columns['duration'] = index_to_col(headers.index(
             'duration')) if 'duration' in headers else None
 
+        # archives will be in a folder 'doc_name/worksheet_name'
+        s3_config.folder = f'{sheet}/{wks.title}/'
+        s3_client = S3Storage(s3_config)
 
         # order matters, first to succeed excludes remaining
         active_archivers = [
@@ -162,37 +171,43 @@
             archivers.WaybackArchiver(s3_client)
         ]
 
-        # loop through rows in worksheet
-        for i in range(2, len(values)+1):
-            v = values[i-1]
+        for i in range(2, len(values) + 1):
+            v = values[i - 1]
+            url = v[url_index]
 
-            if v[url_index] != "" and v[col_to_index(columns['status'])] == "":
-                latest_val = wks.acell(
-                    columns['status'] + str(i)).value
+            if url != "" and v[col_to_index(columns['status'])] == "":
+                latest_val = wks.acell(columns['status'] + str(i)).value
 
                 # check so we don't step on each others' toes
                 if latest_val == '' or latest_val is None:
-                    wks.update(
-                        columns['status'] + str(i), 'Archive in progress')
+                    wks.update(columns['status'] + str(i), 'Archive in progress')
+
+                    # expand short URL links
+                    if 'https://t.co/' in url:
+                        r = requests.get(url)
+                        url = r.url
 
                     for archiver in active_archivers:
                         logger.debug(f"Trying {archiver} on row {i}")
-                        result = archiver.download(v[url_index], check_if_exists=True)
+
+                        result = archiver.download(url, check_if_exists=True)
+
                         if result:
-                            logger.info(f"{archiver} succeeded on row {i}")
+                            logger.success(f"{archiver} succeeded on row {i}")
                             break
 
                     if result:
                         update_sheet(wks, i, result, columns, v)
+                    else:
+                        wks.update(columns['status'] + str(i), 'failed: no archiver')
 
-                # except:
-                # if any unexpected errors occured, log these into the Google Sheet
-                # t, value, traceback = sys.exc_info()
-
-                # update_sheet(wks, i, str(
-                #     value), {}, columns, v)
+                    # except:
+                    # if any unexpected errors occurred, log these into the Google Sheet
+                    # t, value, traceback = sys.exc_info()
+
+                    # update_sheet(wks, i, str(
+                    #     value), {}, columns, v)
 
 
 def main():
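
On the new short-link handling in auto_archive.py: requests follows redirects by default, so r.url already holds the final destination after the GET. A one-line illustration (the t.co link is a placeholder):

    import requests

    r = requests.get('https://t.co/example')  # hypothetical short link
    print(r.url)  # final URL after redirects
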
diff --git a/storages/__init__.py b/storages/__init__.py
new file mode 100644
index 0000000..3054d36
--- /dev/null
+++ b/storages/__init__.py
@@ -0,0 +1,3 @@
+# we need to explicitly expose the available imports here
+from .base_storage import *
+from .s3_storage import *
\ No newline at end of file
diff --git a/storages/base_storage.py b/storages/base_storage.py
new file mode 100644
index 0000000..050a8eb
--- /dev/null
+++ b/storages/base_storage.py
@@ -0,0 +1,19 @@
+from abc import ABC, abstractmethod
+
+
+class Storage(ABC):
+    @abstractmethod
+    def __init__(self, config): pass
+
+    @abstractmethod
+    def get_cdn_url(self, path): pass
+
+    @abstractmethod
+    def exists(self, path): pass
+
+    @abstractmethod
+    def uploadf(self, file, key, **kwargs): pass
+
+    def upload(self, filename: str, key: str, **kwargs):
+        with open(filename, 'rb') as f:
+            self.uploadf(f, key, **kwargs)
diff --git a/storages/s3_storage.py b/storages/s3_storage.py
new file mode 100644
index 0000000..188db7e
--- /dev/null
+++ b/storages/s3_storage.py
@@ -0,0 +1,49 @@
+import boto3
+from botocore.errorfactory import ClientError
+from .base_storage import Storage
+from dataclasses import dataclass
+
+
+@dataclass
+class S3Config:
+    bucket: str
+    region: str
+    key: str
+    secret: str
+    folder: str = ""
+
+
+class S3Storage(Storage):
+
+    def __init__(self, config: S3Config):
+        self.bucket = config.bucket
+        self.region = config.region
+        self.folder = config.folder
+
+        if len(self.folder) and self.folder[-1] != '/':
+            self.folder += '/'
+
+        self.s3 = boto3.client(
+            's3',
+            region_name=self.region,
+            endpoint_url=f'https://{self.region}.digitaloceanspaces.com',
+            aws_access_key_id=config.key,
+            aws_secret_access_key=config.secret
+        )
+
+    def _get_path(self, key):
+        return self.folder + key
+
+    def get_cdn_url(self, key):
+        return f'https://{self.bucket}.{self.region}.cdn.digitaloceanspaces.com/{self._get_path(key)}'
+
+    def exists(self, key):
+        try:
+            self.s3.head_object(Bucket=self.bucket, Key=self._get_path(key))
+            return True
+        except ClientError:
+            return False
+
+    def uploadf(self, file, key, **kwargs):
+        extra_args = kwargs["extra_args"] if "extra_args" in kwargs else {'ACL': 'public-read'}
+        self.s3.upload_fileobj(file, Bucket=self.bucket, Key=self._get_path(key), ExtraArgs=extra_args)
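
End to end, the new pieces wire together roughly like this (usage sketch; the bucket, region, credentials and key names are placeholders):

    from storages import S3Config, S3Storage

    config = S3Config(
        bucket='my-archive-bucket',
        region='fra1',
        key='SPACES_KEY',
        secret='SPACES_SECRET',
        folder='my-sheet/Sheet1/'  # per-worksheet folder, as set in auto_archive.py
    )
    storage = S3Storage(config)

    key = 'youtube_dl_abc123.mp4'
    if not storage.exists(key):
        storage.upload('tmp/abc123.mp4', key)
    print(storage.get_cdn_url(key))
    # https://my-archive-bucket.fra1.cdn.digitaloceanspaces.com/my-sheet/Sheet1/youtube_dl_abc123.mp4
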