auto-archiver/auto_archive.py

151 wiersze
6.2 KiB
Python
Czysty Zwykły widok Historia

2022-06-08 11:39:52 +00:00
import os, datetime, shutil, traceback, random
2022-06-07 16:41:58 +00:00
2022-06-03 16:03:49 +00:00
from loguru import logger
2022-06-07 16:41:58 +00:00
from slugify import slugify
2022-02-21 13:19:09 +00:00
2022-06-08 11:39:52 +00:00
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver
2022-05-09 16:19:38 +00:00
from utils import GWorksheet, mkdir_if_not_exists, expand_url
2022-05-03 18:34:04 +00:00
from configs import Config
2022-06-07 16:41:58 +00:00
from storages import Storage
2022-06-03 15:32:55 +00:00
2022-06-08 11:39:52 +00:00
# Re-seed the global PRNG; called with no argument, Python seeds it from OS
# entropy (or the current time when that is unavailable).
# NOTE(review): `random` is not used elsewhere in the visible part of this
# file - presumably the imported archivers/storages rely on it; confirm.
random.seed()
2022-05-03 18:34:04 +00:00
def update_sheet(gw, row, result: ArchiveResult):
    """Write the outcome of one archival attempt back to its spreadsheet row.

    The ``status`` cell is always overwritten. Every other column is only
    written when the worksheet has that column, the cell is currently empty
    and the result carries a truthy value for it, so values already present
    in the sheet are left untouched. All writes are submitted through a
    single ``gw.batch_set_cell`` call.

    Args:
        gw: worksheet wrapper; this function uses its ``get_row``,
            ``col_exists``, ``get_cell`` and ``batch_set_cell`` methods.
        row: row identifier, passed through to ``gw`` unchanged.
        result: archival outcome. The attributes read are ``status``,
            ``cdn_url``, ``thumbnail``, ``thumbnail_index``, ``title``,
            ``duration``, ``screenshot``, ``hash`` and ``timestamp``.
            ``timestamp`` may be ``None``, epoch seconds (int or float),
            an already formatted string, or any object exposing
            ``isoformat()``.
    """
    utc = datetime.timezone.utc
    cell_updates = []
    row_values = gw.get_row(row)

    def batch_if_valid(col, val, final_value=None):
        # `val` decides whether to write; `final_value` (when given and
        # truthy) is what actually lands in the cell.
        final_value = final_value or val
        if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
            cell_updates.append((row, col, final_value))

    # Status is unconditional: it must replace whatever was there before.
    cell_updates.append((row, 'status', result.status))

    batch_if_valid('archive', result.cdn_url)
    batch_if_valid('date', True, datetime.datetime.now(utc).isoformat())
    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
    batch_if_valid('thumbnail_index', result.thumbnail_index)
    batch_if_valid('title', result.title)
    batch_if_valid('duration', result.duration, str(result.duration))
    batch_if_valid('screenshot', result.screenshot)
    batch_if_valid('hash', result.hash)

    timestamp = result.timestamp
    if timestamp is not None:
        if isinstance(timestamp, (int, float)):
            # Epoch seconds must be converted with an explicit tz: the naive
            # fromtimestamp() yields local time, and stamping that as UTC
            # afterwards shifts the value by the host's UTC offset.
            timestamp_string = datetime.datetime.fromtimestamp(timestamp, tz=utc).isoformat()
        elif isinstance(timestamp, str):
            timestamp_string = timestamp
        else:
            timestamp_string = timestamp.isoformat()
        batch_if_valid('timestamp', timestamp_string)

    gw.batch_set_cell(cell_updates)
2022-06-03 16:03:49 +00:00
def missing_required_columns(gw: GWorksheet):
    """Tell whether the worksheet lacks a column the archiver needs.

    Checks the 'url' and 'status' columns via ``gw.col_exists`` and logs one
    warning per absent column.

    Returns:
        True if at least one of the required columns is absent, else False.
    """
    absent = [name for name in ('url', 'status') if not gw.col_exists(name)]
    for required_col in absent:
        logger.warning(f'Required column for {required_col}: "{gw.columns[required_col]}" not found, skipping worksheet {gw.wks.title}')
    return bool(absent)
2022-06-03 16:03:49 +00:00
def _run_archivers(archivers, url, row, is_retry):
    """Try each archiver on ``url`` in order, stopping at the first success.

    Args:
        archivers: archiver instances, tried in the given order.
        url: the URL to archive.
        row: spreadsheet row, used only in log messages.
        is_retry: whether this row is being retried; when true, a retry
            marker in a non-successful status is removed with
            ``Archiver.remove_retry`` so the row is not retried again.

    Returns:
        The result of the archiver that succeeded, otherwise whatever the
        last archiver tried produced; ``False`` if that one raised, or if
        ``archivers`` is empty.
    """
    result = False
    for archiver in archivers:
        logger.debug(f'Trying {archiver} on {row=}')
        try:
            result = archiver.download(url, check_if_exists=True)
        except Exception as e:
            # KeyboardInterrupt does not derive from Exception, so it still
            # propagates to the caller's interrupt handler.
            result = False
            logger.error(f'Got unexpected error in row {row} with {archiver.name} for {url=}: {e}\n{traceback.format_exc()}')

        if not result:
            continue

        success = result.status in ['success', 'already archived']
        result.status = f"{archiver.name}: {result.status}"
        if success:
            logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
            break
        # only 1 retry possible for now
        if is_retry and Archiver.is_retry(result.status):
            result.status = Archiver.remove_retry(result.status)
        logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
    return result


def process_sheet(c: Config):
    """Archive every pending URL in every worksheet of the configured document.

    For each worksheet that has the required columns, rows after the header
    are scanned. A row is processed when it has a URL and either an empty
    status or a status that ``Archiver.should_retry_from_status`` accepts.
    The row is marked 'Archive in progress', the archivers are tried in
    order, and the outcome is written back to the sheet.

    Args:
        c: parsed configuration; supplies the sheets client, document name,
            header row, column names, storage, webdriver and archiver
            credentials.

    Raises:
        SystemExit: on KeyboardInterrupt, after clearing the current row's
            status and destroying the webdriver.
    """
    sh = c.gsheets_client.open(c.sheet)

    # loop through worksheets to check
    for ii, wks in enumerate(sh.worksheets()):
        logger.info(f'Opening worksheet {ii=}: {wks.title=} {c.header=}')
        gw = GWorksheet(wks, header_row=c.header, columns=c.column_names)

        if missing_required_columns(gw):
            continue

        # archives will default to being in a folder 'doc_name/worksheet_name'
        default_folder = os.path.join(slugify(c.sheet), slugify(wks.title))
        c.set_folder(default_folder)
        storage = c.get_storage()

        # loop through rows in worksheet
        for row in range(1 + c.header, gw.count_rows() + 1):
            url = gw.get_cell(row, 'url')
            if url == '':
                # Nothing to archive, whatever the status column says.
                continue

            original_status = gw.get_cell(row, 'status')
            # Re-read the status fresh only for rows that look pending.
            # NOTE(review): presumably guards against another process having
            # claimed the row since the sheet was loaded - confirm.
            status = gw.get_cell(row, 'status', fresh=original_status in ['', None])

            is_retry = False
            if status not in ['', None]:
                is_retry = Archiver.should_retry_from_status(status)
                if not is_retry:
                    continue

            # All checks done - archival process starts here
            try:
                gw.set_cell(row, 'status', 'Archive in progress')
                url = expand_url(url)
                c.set_folder(gw.get_cell_or_default(row, 'folder', default_folder, when_empty_use_default=True))

                # make a new driver so each spreadsheet row is idempotent
                c.recreate_webdriver()

                # order matters, first to succeed excludes remaining
                active_archivers = [
                    TelethonArchiver(storage, c.webdriver, c.telegram_config),
                    TiktokArchiver(storage, c.webdriver),
                    YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
                    TelegramArchiver(storage, c.webdriver),
                    TwitterArchiver(storage, c.webdriver),
                    WaybackArchiver(storage, c.webdriver, c.wayback_config),
                ]
                result = _run_archivers(active_archivers, url, row, is_retry)

                if result:
                    update_sheet(gw, row, result)
                else:
                    gw.set_cell(row, 'status', 'failed: no archiver')
            except KeyboardInterrupt:
                # catches keyboard interruptions to do a clean exit
                logger.warning(f"caught interrupt on {row=}, {url=}")
                gw.set_cell(row, 'status', '')
                c.destroy_webdriver()
                # `exit()` is an interactive helper injected by `site`;
                # raising SystemExit works in every runtime configuration.
                raise SystemExit
            except Exception as e:
                logger.error(f'Got unexpected error in row {row} for {url=}: {e}\n{traceback.format_exc()}')
                gw.set_cell(row, 'status', 'failed: unexpected error (see logs)')

        logger.success(f'Finished worksheet {wks.title}')
2021-03-15 09:08:02 +00:00
2022-05-10 21:09:33 +00:00
2022-05-09 10:55:10 +00:00
@logger.catch
def main():
    """Entry point: parse the configuration and archive the configured sheet.

    Creates the temporary working folder, processes the document, then
    releases the webdriver and removes the temporary folder. The temporary
    folder is removed even when processing fails or the run is interrupted.
    """
    c = Config()
    c.parse()
    logger.info(f'Opening document {c.sheet} for header {c.header}')
    mkdir_if_not_exists(Storage.TMP_FOLDER)
    try:
        process_sheet(c)
    except Exception:
        # Unexpected failure: release the browser before the error propagates
        # to logger.catch. The interrupt path is deliberately not handled
        # here - process_sheet destroys the webdriver itself before raising
        # SystemExit, and SystemExit is not an Exception subclass.
        c.destroy_webdriver()
        raise
    else:
        c.destroy_webdriver()
    finally:
        shutil.rmtree(Storage.TMP_FOLDER)
2021-06-01 09:00:40 +00:00
2022-02-25 15:09:35 +00:00
2022-02-23 08:57:04 +00:00
# Run the archiver only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()