diff --git a/archivers/base_archiver.py b/archivers/base_archiver.py
index a55b1ca..1e2c20b 100644
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@@ -43,6 +43,7 @@ class Archiver(ABC):
     def get_netloc(self, url):
         return urlparse(url).netloc
 
+    def get_key(self, filename):
         """
         returns a key in the format "[archiverName]_[filename]" includes extension
         """
diff --git a/archivers/telegram_archiver.py b/archivers/telegram_archiver.py
index c43fc7d..d6207df 100644
--- a/archivers/telegram_archiver.py
+++ b/archivers/telegram_archiver.py
@@ -55,8 +55,8 @@ class TelegramArchiver(Archiver):
         # extract duration from HTML
         duration = s.find_all('time')[0].contents[0]
         if ':' in duration:
-            duration = float(duration.split(':')[0]) * 60 \
-                + float(duration.split(':')[1])
+            duration = float(duration.split(
+                ':')[0]) * 60 + float(duration.split(':')[1])
         else:
             duration = float(duration)
 
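Note on the telegram_archiver.py hunk above: the change only re-wraps the line, the arithmetic is unchanged. It converts an `mm:ss` duration string to seconds. A minimal spelled-out equivalent (illustrative only, not part of the diff):

```python
# illustrative only: same conversion as the diff, with named parts
duration = "1:30"                  # as scraped from the <time> element
if ':' in duration:
    minutes, seconds = duration.split(':')
    duration = float(minutes) * 60 + float(seconds)  # -> 90.0
else:
    duration = float(duration)
```

One caveat the re-wrap preserves: an `h:mm:ss` value would be read as minutes and seconds, silently dropping a field; the tuple unpacking above raises `ValueError` on such input instead.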
diff --git a/auto_archive.py b/auto_archive.py
index 211d3d7..6b6917d 100644
--- a/auto_archive.py
+++ b/auto_archive.py
@@ -6,6 +6,7 @@ import shutil
 import gspread
 from loguru import logger
 from dotenv import load_dotenv
+from selenium import webdriver
 
 import archivers
 from storages import S3Storage, S3Config
@@ -27,13 +28,17 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
     batch_if_valid('archive', result.cdn_url)
     batch_if_valid('date', True, datetime.datetime.now().isoformat())
-    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
+    batch_if_valid('thumbnail', result.thumbnail,
+                   f'=IMAGE("{result.thumbnail}")')
     batch_if_valid('thumbnail_index', result.thumbnail_index)
     batch_if_valid('title', result.title)
     batch_if_valid('duration', result.duration, str(result.duration))
+    batch_if_valid('screenshot', result.screenshot)
+    batch_if_valid('hash', result.hash)
 
     if result.timestamp and type(result.timestamp) != str:
-        result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
+        result.timestamp = datetime.datetime.fromtimestamp(
+            result.timestamp).isoformat()
     batch_if_valid('timestamp', result.timestamp)
 
     gw.batch_set_cell(cell_updates)
 
@@ -50,7 +55,7 @@ def expand_url(url):
     return url
 
 
-def process_sheet(sheet):
+def process_sheet(sheet, header=1):
     gc = gspread.service_account(filename='service_account.json')
     sh = gc.open(sheet)
 
@@ -61,73 +66,97 @@ def process_sheet(sheet):
         secret=os.getenv('DO_SPACES_SECRET')
     )
 
+    driver = webdriver.Firefox()
+    driver.set_window_size(1400, 2000)
+
     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
         logger.info(f'Opening worksheet {ii}: "{wks.title}"')
-        gw = GWorksheet(wks)
+        gw = GWorksheet(wks, header_row=header)
 
         if not gw.col_exists('url'):
-            logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
+            logger.warning(
+                f'No "Media URL" column found, skipping worksheet {wks.title}')
             continue
 
         if not gw.col_exists('status'):
-            logger.warning(f'No "Archive status" column found, skipping worksheet {wks.title}')
+            logger.warning(
+                f'No "Archive status" column found, skipping worksheet {wks.title}')
             continue
 
         # archives will be in a folder 'doc_name/worksheet_name'
-        s3_config.folder = f'{sheet}/{wks.title}/'
+        s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
         s3_client = S3Storage(s3_config)
 
         # order matters, first to succeed excludes remaining
         active_archivers = [
-            archivers.TelegramArchiver(s3_client),
-            archivers.TiktokArchiver(s3_client),
-            archivers.YoutubeDLArchiver(s3_client),
-            archivers.WaybackArchiver(s3_client)
+            archivers.TelegramArchiver(s3_client, driver),
+            archivers.TiktokArchiver(s3_client, driver),
+            archivers.YoutubeDLArchiver(s3_client, driver),
+            archivers.TwitterArchiver(s3_client, driver),
+            archivers.WaybackArchiver(s3_client, driver)
        ]
 
+        values = gw.get_values()
         # loop through rows in worksheet
-        for row in range(2, gw.count_rows() + 1):
-            url = gw.get_cell(row, 'url')
-            status = gw.get_cell(row, 'status')
+        for row in range(1 + header, gw.count_rows() + 1):
+            row_values = values[row-1]
+            url = gw.get_cell(row_values, 'url')
+            status = gw.get_cell(row_values, 'status')
 
             if url != '' and status in ['', None]:
-                gw.set_cell(row, 'status', 'Archive in progress')
+                url = gw.get_cell(row, 'url')
+                status = gw.get_cell(row, 'status')
 
-                url = expand_url(url)
+                if url != '' and status in ['', None]:
+                    gw.set_cell(row, 'status', 'Archive in progress')
 
-                for archiver in active_archivers:
-                    logger.debug(f'Trying {archiver} on row {row}')
+                    url = expand_url(url)
 
-                    # TODO: add support for multiple videos/images
-                    try:
-                        result = archiver.download(url, check_if_exists=True)
-                    except Exception as e:
-                        result = False
-                        logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
+                    for archiver in active_archivers:
+                        logger.debug(f'Trying {archiver} on row {row}')
+
+                        # TODO: add support for multiple videos/images
+                        try:
+                            result = archiver.download(
+                                url, check_if_exists=True)
+                        except Exception as e:
+                            result = False
+                            logger.error(
+                                f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
+
+                        if result:
+                            if result.status in ['success', 'already archived']:
+                                result.status = archiver.name + \
+                                    ": " + str(result.status)
+                                logger.success(
+                                    f'{archiver} succeeded on row {row}')
+                                break
+                            logger.warning(
+                                f'{archiver} did not succeed on row {row}, final status: {result.status}')
+                            result.status = archiver.name + \
+                                ": " + str(result.status)
 
                     if result:
-                        if result.status in ['success', 'already archived']:
-                            logger.success(f'{archiver} succeeded on row {row}')
-                            break
-                        logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
+                        update_sheet(gw, row, result)
+                    else:
+                        gw.set_cell(row, 'status', 'failed: no archiver')
 
-            if result:
-                update_sheet(gw, row, result)
-            else:
-                gw.set_cell(row, 'status', 'failed: no archiver')
+    driver.quit()
 
 
 def main():
     parser = argparse.ArgumentParser(
         description='Automatically archive social media videos from a Google Sheets document')
     parser.add_argument('--sheet', action='store', dest='sheet')
+    parser.add_argument('--header', action='store', dest='header', default=1, type=int)
     args = parser.parse_args()
 
     logger.info(f'Opening document {args.sheet}')
 
     mkdir_if_not_exists('tmp')
-    process_sheet(args.sheet)
+    process_sheet(args.sheet, header=args.header)
     shutil.rmtree('tmp')
 
+
 if __name__ == '__main__':
     main()
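Every archiver is now constructed with a shared Selenium driver, and `update_sheet` writes the new `screenshot` and `hash` columns. A minimal sketch of how an archiver could use that driver to fill the `screenshot` field; the constructor shape is inferred from the calls above, and `take_screenshot` is a hypothetical helper, since `base_archiver.py`'s constructor is not shown in this diff:

```python
from selenium import webdriver

class Archiver:
    # inferred shape: process_sheet passes (storage, driver) to every archiver
    def __init__(self, storage, driver):
        self.storage = storage
        self.driver = driver  # one Firefox instance shared across archivers and rows

    # hypothetical helper: the real method name and upload step are not in this diff
    def take_screenshot(self, url):
        self.driver.get(url)
        return self.driver.get_screenshot_as_png()  # PNG bytes, ready for storage
```

Creating the driver once in `process_sheet` and quitting it after the worksheet loop avoids launching a browser per row.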
diff --git a/utils/gworksheet.py b/utils/gworksheet.py
index 4349e2a..f7f0549 100644
--- a/utils/gworksheet.py
+++ b/utils/gworksheet.py
@@ -3,7 +3,7 @@ from gspread import utils
 
 class GWorksheet:
     COLUMN_NAMES = {
-        'url': 'media url',
+        'url': 'link',
         'archive': 'archive location',
         'date': 'archive date',
         'status': 'archive status',
@@ -11,12 +11,14 @@ class GWorksheet:
         'thumbnail_index': 'thumbnail index',
         'timestamp': 'upload timestamp',
         'title': 'upload title',
-        'duration': 'duration'
+        'duration': 'duration',
+        'screenshot': 'screenshot',
+        'hash': 'hash'
     }
 
-    def __init__(self, worksheet, columns=COLUMN_NAMES):
+    def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
         self.wks = worksheet
-        self.headers = [v.lower() for v in self.wks.row_values(1)]
+        self.headers = [v.lower() for v in self.wks.row_values(header_row)]
         self.columns = columns
 
     def _check_col_exists(self, col: str):
@@ -38,6 +40,9 @@ class GWorksheet:
         # row is 1-based
         return self.wks.row_values(row)
 
+    def get_values(self):
+        return self.wks.get_values()
+
     def get_cell(self, row, col: str):
         """
         returns the cell value from (row, col),
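The new `get_values()` pairs with the row loop in `auto_archive.py`: the worksheet is fetched in one bulk read, cells are served from the cached rows, and a fresh per-cell read happens only for rows that look ready to archive, guarding against edits made after the bulk fetch. A sketch of that pattern, assuming `get_cell` accepts either a 1-based row number or an already-fetched list of row values, as its use in `auto_archive.py` implies:

```python
gw = GWorksheet(wks, header_row=2)          # header on the sheet's second row
values = gw.get_values()                    # one bulk read for the whole sheet

for row in range(3, gw.count_rows() + 1):   # data starts below the header row
    row_values = values[row - 1]            # values is 0-indexed; rows are 1-based
    url = gw.get_cell(row_values, 'url')    # cached read, no extra API call
    if url != '':
        url = gw.get_cell(row, 'url')       # fresh single-cell read before archiving
```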