Add header argument; set up webdriver

pull/16/head
Logan Williams 2022-02-25 16:09:35 +01:00
rodzic 09dc5b5b81
commit 63a2847ac9
4 zmienionych plików z 74 dodań i 39 usunięć

Wyświetl plik

@ -43,6 +43,7 @@ class Archiver(ABC):
def get_netloc(self, url):
    """Return the network-location (host[:port]) component of *url*."""
    parsed = urlparse(url)
    return parsed.netloc
def get_key(self, filename):
"""
returns a key in the format "[archiverName]_[filename]" includes extension

Wyświetl plik

@ -55,8 +55,8 @@ class TelegramArchiver(Archiver):
# extract duration from HTML
duration = s.find_all('time')[0].contents[0]
if ':' in duration:
duration = float(duration.split(':')[0]) * 60
+ float(duration.split(':')[1])
duration = float(duration.split(
':')[0]) * 60 + float(duration.split(':')[1])
else:
duration = float(duration)

Wyświetl plik

@ -6,6 +6,7 @@ import shutil
import gspread
from loguru import logger
from dotenv import load_dotenv
from selenium import webdriver
import archivers
from storages import S3Storage, S3Config
@ -27,13 +28,17 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
batch_if_valid('archive', result.cdn_url)
batch_if_valid('date', True, datetime.datetime.now().isoformat())
batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
batch_if_valid('thumbnail', result.thumbnail,
f'=IMAGE("{result.thumbnail}")')
batch_if_valid('thumbnail_index', result.thumbnail_index)
batch_if_valid('title', result.title)
batch_if_valid('duration', result.duration, str(result.duration))
batch_if_valid('screenshot', result.screenshot)
batch_if_valid('hash', result.hash)
if result.timestamp and type(result.timestamp) != str:
result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
result.timestamp = datetime.datetime.fromtimestamp(
result.timestamp).isoformat()
batch_if_valid('timestamp', result.timestamp)
gw.batch_set_cell(cell_updates)
@ -50,7 +55,7 @@ def expand_url(url):
return url
def process_sheet(sheet):
def process_sheet(sheet, header=1):
gc = gspread.service_account(filename='service_account.json')
sh = gc.open(sheet)
@ -61,73 +66,97 @@ def process_sheet(sheet):
secret=os.getenv('DO_SPACES_SECRET')
)
driver = webdriver.Firefox()
driver.set_window_size(1400, 2000)
# loop through worksheets to check
for ii, wks in enumerate(sh.worksheets()):
logger.info(f'Opening worksheet {ii}: "{wks.title}"')
gw = GWorksheet(wks)
gw = GWorksheet(wks, header_row=header)
if not gw.col_exists('url'):
logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
logger.warning(
f'No "Media URL" column found, skipping worksheet {wks.title}')
continue
if not gw.col_exists('status'):
logger.warning(f'No "Archive status" column found, skipping worksheet {wks.title}')
logger.warning(
f'No "Archive status" column found, skipping worksheet {wks.title}')
continue
# archives will be in a folder 'doc_name/worksheet_name'
s3_config.folder = f'{sheet}/{wks.title}/'
s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
s3_client = S3Storage(s3_config)
# order matters, first to succeed excludes remaining
active_archivers = [
archivers.TelegramArchiver(s3_client),
archivers.TiktokArchiver(s3_client),
archivers.YoutubeDLArchiver(s3_client),
archivers.WaybackArchiver(s3_client)
archivers.TelegramArchiver(s3_client, driver),
archivers.TiktokArchiver(s3_client, driver),
archivers.YoutubeDLArchiver(s3_client, driver),
archivers.TwitterArchiver(s3_client, driver),
archivers.WaybackArchiver(s3_client, driver)
]
values = gw.get_values()
# loop through rows in worksheet
for row in range(2, gw.count_rows() + 1):
url = gw.get_cell(row, 'url')
status = gw.get_cell(row, 'status')
for row in range(1 + header, gw.count_rows() + 1):
row_values = values[row-1]
url = gw.get_cell(row_values, 'url')
status = gw.get_cell(row_values, 'status')
if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress')
url = gw.get_cell(row, 'url')
status = gw.get_cell(status, 'status')
url = expand_url(url)
if url != '' and status in ['', None]:
gw.set_cell(row, 'status', 'Archive in progress')
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
url = expand_url(url)
# TODO: add support for multiple videos/images
try:
result = archiver.download(url, check_if_exists=True)
except Exception as e:
result = False
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
for archiver in active_archivers:
logger.debug(f'Trying {archiver} on row {row}')
# TODO: add support for multiple videos/images
# try:
result = archiver.download(
url, check_if_exists=True)
# except Exception as e:
# result = False
# logger.error(
# f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
if result:
if result.status in ['success', 'already archived']:
result.status = archiver.name + \
": " + str(result.status)
logger.success(
f'{archiver} succeeded on row {row}')
break
logger.warning(
f'{archiver} did not succeed on row {row}, final status: {result.status}')
result.status = archiver.name + \
": " + str(result.status)
if result:
if result.status in ['success', 'already archived']:
logger.success(f'{archiver} succeeded on row {row}')
break
logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
update_sheet(gw, row, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
if result:
update_sheet(gw, row, result)
else:
gw.set_cell(row, 'status', 'failed: no archiver')
driver.quit()
def main():
    """
    CLI entry point: parse arguments and archive every row of the
    given Google Sheets document.

    Flags:
        --sheet   name of the Google Sheets document to process
        --header  1-based row holding the column headers (default 1)

    Media is downloaded into a temporary 'tmp' folder which is removed
    when processing finishes.
    """
    parser = argparse.ArgumentParser(
        description='Automatically archive social media videos from a Google Sheets document')
    parser.add_argument('--sheet', action='store', dest='sheet')
    parser.add_argument('--header', action='store', dest='header', default=1, type=int)
    args = parser.parse_args()

    logger.info(f'Opening document {args.sheet}')

    mkdir_if_not_exists('tmp')
    # Single call only: the old header-less invocation was dropped when the
    # --header flag was introduced (calling both would process the sheet twice).
    process_sheet(args.sheet, header=args.header)
    shutil.rmtree('tmp')
# Script entry point: only run the archiver when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()

Wyświetl plik

@ -3,7 +3,7 @@ from gspread import utils
class GWorksheet:
COLUMN_NAMES = {
'url': 'media url',
'url': 'link',
'archive': 'archive location',
'date': 'archive date',
'status': 'archive status',
@ -11,12 +11,14 @@ class GWorksheet:
'thumbnail_index': 'thumbnail index',
'timestamp': 'upload timestamp',
'title': 'upload title',
'duration': 'duration'
'duration': 'duration',
'screenshot': 'screenshot',
'hash': 'hash'
}
def __init__(self, worksheet, columns=COLUMN_NAMES):
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
self.wks = worksheet
self.headers = [v.lower() for v in self.wks.row_values(1)]
self.headers = [v.lower() for v in self.wks.row_values(header_row)]
self.columns = columns
def _check_col_exists(self, col: str):
@ -38,6 +40,9 @@ class GWorksheet:
# row is 1-based
return self.wks.row_values(row)
def get_values(self):
    """Return every cell of the worksheet as a list of row-value lists."""
    all_rows = self.wks.get_values()
    return all_rows
def get_cell(self, row, col: str):
"""
returns the cell value from (row, col),