2021-02-09 13:55:26 +00:00
|
|
|
import os
|
2022-02-21 13:19:09 +00:00
|
|
|
import datetime
|
2021-02-09 13:55:26 +00:00
|
|
|
import argparse
|
2022-02-22 15:03:35 +00:00
|
|
|
import requests
|
2022-02-21 13:19:09 +00:00
|
|
|
import gspread
|
2022-02-20 09:27:25 +00:00
|
|
|
from loguru import logger
|
2022-02-21 13:19:09 +00:00
|
|
|
from dotenv import load_dotenv
|
|
|
|
|
2022-02-20 09:27:25 +00:00
|
|
|
import archivers
|
2022-02-22 15:03:35 +00:00
|
|
|
from storages import S3Storage, S3Config
|
2022-02-23 15:24:59 +00:00
|
|
|
from utils import GWorksheet
|
2021-02-09 13:55:26 +00:00
|
|
|
|
|
|
|
# Load local .env into os.environ (supplies the DO_* S3 credentials read below).
load_dotenv()
|
|
|
|
|
2021-05-03 12:16:09 +00:00
|
|
|
|
2022-02-23 08:54:03 +00:00
|
|
|
def update_sheet(gw, row, result: "archivers.ArchiveResult"):
    """Write the outcome of an archive attempt back into one worksheet row.

    Only columns that exist in the sheet AND are currently empty are filled,
    so manually-entered values are never overwritten; the 'status' column is
    the exception and is always rewritten. All writes are collected and sent
    as a single batch to minimise Google Sheets API calls.

    :param gw: GWorksheet wrapper around the target worksheet.
    :param row: 1-based row number being updated.
    :param result: outcome produced by the archiver that handled this row.
    """
    cell_updates = []
    row_values = gw.get_row(row)

    def batch_if_valid(col, val, final_value=None):
        # Queue a write only when there is a value, the column exists in this
        # sheet, and the target cell is still empty (don't clobber manual edits).
        final_value = final_value or val
        if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
            cell_updates.append((row, col, final_value))

    # Status is always (re)written, even over an existing value.
    cell_updates.append((row, 'status', result.status))

    batch_if_valid('archive', result.cdn_url)
    # NOTE(review): naive local time, no timezone — presumably matches the
    # sheet's existing date convention; confirm before switching to UTC.
    batch_if_valid('date', True, datetime.datetime.now().isoformat())
    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
    batch_if_valid('thumbnail_index', result.thumbnail_index)
    batch_if_valid('title', result.title)
    batch_if_valid('duration', result.duration, str(result.duration))

    # Normalise a numeric (unix) timestamp to ISO-8601. Done on a local so the
    # caller's result object is not mutated as a side effect (the original
    # reassigned result.timestamp in place).
    timestamp = result.timestamp
    if timestamp and not isinstance(timestamp, str):
        timestamp = datetime.datetime.fromtimestamp(timestamp).isoformat()
    batch_if_valid('timestamp', timestamp)

    gw.batch_set_cell(cell_updates)
|
|
|
|
|
|
|
|
|
|
|
|
def expand_url(url):
    """Resolve https://t.co/ short links to their final destination URL.

    Twitter's t.co links hide the real target; following the redirect chain
    lets the archivers operate on the true URL. This is best-effort: any
    network/HTTP failure is logged and the original URL is returned.

    :param url: candidate URL, possibly a t.co short link.
    :return: the expanded URL, or the input unchanged for non-t.co input
        or on failure.
    """
    if 'https://t.co/' in url:
        try:
            # requests follows redirects by default; .url is the final hop.
            r = requests.get(url)
            url = r.url
        except requests.RequestException as e:
            # Narrowed from a bare `except:` — only network/HTTP errors are
            # expected here, and we no longer swallow e.g. KeyboardInterrupt.
            logger.error(f'Failed to expand url {url}: {e}')
    return url
|
2021-03-15 09:08:02 +00:00
|
|
|
|
2021-06-01 09:00:40 +00:00
|
|
|
|
2021-03-25 12:42:42 +00:00
|
|
|
def _archive_row(gw, row, url, active_archivers):
    """Run the archivers in order on one row and record the outcome.

    Marks the row 'Archive in progress', expands the URL, then tries each
    archiver until one reports 'success' or 'already archived'. Unexpected
    archiver exceptions are logged and treated as failure for that archiver.
    """
    gw.set_cell(row, 'status', 'Archive in progress')
    url = expand_url(url)

    # Initialised so an empty archiver list cannot raise NameError below.
    result = False
    for archiver in active_archivers:
        logger.debug(f'Trying {archiver} on row {row}')

        # TODO: add support for multiple videos/images
        try:
            result = archiver.download(url, check_if_exists=True)
        except Exception as e:
            result = False
            logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')

        if result:
            if result.status in ['success', 'already archived']:
                logger.success(f'{archiver} succeeded on row {row}')
                break
            logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')

    if result:
        update_sheet(gw, row, result)
    else:
        gw.set_cell(row, 'status', 'failed: no archiver')


def process_sheet(sheet):
    """Archive every pending row of every worksheet in a Google Sheets doc.

    Opens the document with the local service account, builds the S3 config
    from DO_* environment variables, and for each worksheet that has both a
    'url' and a 'status' column, archives every row whose url is set and
    whose status is still empty.

    :param sheet: name of the Google Sheets document to open.
    """
    gc = gspread.service_account(filename='service_account.json')
    sh = gc.open(sheet)

    s3_config = S3Config(
        bucket=os.getenv('DO_BUCKET'),
        region=os.getenv('DO_SPACES_REGION'),
        key=os.getenv('DO_SPACES_KEY'),
        secret=os.getenv('DO_SPACES_SECRET')
    )

    # loop through worksheets to check
    for ii, wks in enumerate(sh.worksheets()):
        logger.info(f'Opening worksheet {ii}: "{wks.title}"')
        gw = GWorksheet(wks)

        if not gw.col_exists('url'):
            logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
            continue

        if not gw.col_exists('status'):
            logger.warning("No 'Archive status' column found, skipping")
            continue

        # archives will be in a folder 'doc_name/worksheet_name'
        s3_config.folder = f'{sheet}/{wks.title}/'
        s3_client = S3Storage(s3_config)

        # order matters, first to succeed excludes remaining
        active_archivers = [
            archivers.TelegramArchiver(s3_client),
            archivers.TiktokArchiver(s3_client),
            archivers.YoutubeDLArchiver(s3_client),
            archivers.WaybackArchiver(s3_client)
        ]

        # loop through rows in worksheet (row 1 is the header)
        for row in range(2, gw.count_rows() + 1):
            url = gw.get_cell(row, 'url')
            status = gw.get_cell(row, 'status')
            if url != '' and status in ['', None]:
                _archive_row(gw, row, url, active_archivers)
|
2021-03-15 09:08:02 +00:00
|
|
|
|
2021-02-09 13:55:26 +00:00
|
|
|
|
2021-03-25 12:42:42 +00:00
|
|
|
def main():
    """CLI entry point: parse --sheet and archive its pending rows."""
    parser = argparse.ArgumentParser(
        description='Automatically archive social media videos from a Google Sheets document')
    # required: without a sheet name there is nothing to process; failing here
    # gives a clear argparse usage error instead of passing None into gspread.
    parser.add_argument('--sheet', action='store', dest='sheet', required=True)
    args = parser.parse_args()

    logger.info(f'Opening document {args.sheet}')
    process_sheet(args.sheet)
|
2021-03-15 09:08:02 +00:00
|
|
|
|
2021-06-01 09:00:40 +00:00
|
|
|
|
2022-02-23 08:57:04 +00:00
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|