kopia lustrzana https://github.com/bellingcat/auto-archiver
Add header argument; set up webdriver
rodzic
09dc5b5b81
commit
63a2847ac9
|
@ -43,6 +43,7 @@ class Archiver(ABC):
|
|||
def get_netloc(self, url):
|
||||
return urlparse(url).netloc
|
||||
|
||||
|
||||
def get_key(self, filename):
|
||||
"""
|
||||
returns a key in the format "[archiverName]_[filename]" includes extension
|
||||
|
|
|
@ -55,8 +55,8 @@ class TelegramArchiver(Archiver):
|
|||
# extract duration from HTML
|
||||
duration = s.find_all('time')[0].contents[0]
|
||||
if ':' in duration:
|
||||
duration = float(duration.split(':')[0]) * 60
|
||||
+ float(duration.split(':')[1])
|
||||
duration = float(duration.split(
|
||||
':')[0]) * 60 + float(duration.split(':')[1])
|
||||
else:
|
||||
duration = float(duration)
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ import shutil
|
|||
import gspread
|
||||
from loguru import logger
|
||||
from dotenv import load_dotenv
|
||||
from selenium import webdriver
|
||||
|
||||
import archivers
|
||||
from storages import S3Storage, S3Config
|
||||
|
@ -27,13 +28,17 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
|
|||
|
||||
batch_if_valid('archive', result.cdn_url)
|
||||
batch_if_valid('date', True, datetime.datetime.now().isoformat())
|
||||
batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
|
||||
batch_if_valid('thumbnail', result.thumbnail,
|
||||
f'=IMAGE("{result.thumbnail}")')
|
||||
batch_if_valid('thumbnail_index', result.thumbnail_index)
|
||||
batch_if_valid('title', result.title)
|
||||
batch_if_valid('duration', result.duration, str(result.duration))
|
||||
batch_if_valid('screenshot', result.screenshot)
|
||||
batch_if_valid('hash', result.hash)
|
||||
|
||||
if result.timestamp and type(result.timestamp) != str:
|
||||
result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
|
||||
result.timestamp = datetime.datetime.fromtimestamp(
|
||||
result.timestamp).isoformat()
|
||||
batch_if_valid('timestamp', result.timestamp)
|
||||
|
||||
gw.batch_set_cell(cell_updates)
|
||||
|
@ -50,7 +55,7 @@ def expand_url(url):
|
|||
return url
|
||||
|
||||
|
||||
def process_sheet(sheet):
|
||||
def process_sheet(sheet, header=1):
|
||||
gc = gspread.service_account(filename='service_account.json')
|
||||
sh = gc.open(sheet)
|
||||
|
||||
|
@ -61,73 +66,97 @@ def process_sheet(sheet):
|
|||
secret=os.getenv('DO_SPACES_SECRET')
|
||||
)
|
||||
|
||||
driver = webdriver.Firefox()
|
||||
driver.set_window_size(1400, 2000)
|
||||
|
||||
# loop through worksheets to check
|
||||
for ii, wks in enumerate(sh.worksheets()):
|
||||
logger.info(f'Opening worksheet {ii}: "{wks.title}"')
|
||||
gw = GWorksheet(wks)
|
||||
gw = GWorksheet(wks, header_row=header)
|
||||
|
||||
if not gw.col_exists('url'):
|
||||
logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
|
||||
logger.warning(
|
||||
f'No "Media URL" column found, skipping worksheet {wks.title}')
|
||||
continue
|
||||
|
||||
if not gw.col_exists('status'):
|
||||
logger.warning(f'No "Archive status" column found, skipping worksheet {wks.title}')
|
||||
logger.warning(
|
||||
f'No "Archive status" column found, skipping worksheet {wks.title}')
|
||||
continue
|
||||
|
||||
# archives will be in a folder 'doc_name/worksheet_name'
|
||||
s3_config.folder = f'{sheet}/{wks.title}/'
|
||||
s3_config.folder = f'{sheet.replace(" ", "_")}/{wks.title.replace(" ", "_")}/'
|
||||
s3_client = S3Storage(s3_config)
|
||||
|
||||
# order matters, first to succeed excludes remaining
|
||||
active_archivers = [
|
||||
archivers.TelegramArchiver(s3_client),
|
||||
archivers.TiktokArchiver(s3_client),
|
||||
archivers.YoutubeDLArchiver(s3_client),
|
||||
archivers.WaybackArchiver(s3_client)
|
||||
archivers.TelegramArchiver(s3_client, driver),
|
||||
archivers.TiktokArchiver(s3_client, driver),
|
||||
archivers.YoutubeDLArchiver(s3_client, driver),
|
||||
archivers.TwitterArchiver(s3_client, driver),
|
||||
archivers.WaybackArchiver(s3_client, driver)
|
||||
]
|
||||
|
||||
values = gw.get_values()
|
||||
# loop through rows in worksheet
|
||||
for row in range(2, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url')
|
||||
status = gw.get_cell(row, 'status')
|
||||
for row in range(1 + header, gw.count_rows() + 1):
|
||||
row_values = values[row-1]
|
||||
url = gw.get_cell(row_values, 'url')
|
||||
status = gw.get_cell(row_values, 'status')
|
||||
if url != '' and status in ['', None]:
|
||||
gw.set_cell(row, 'status', 'Archive in progress')
|
||||
url = gw.get_cell(row, 'url')
|
||||
status = gw.get_cell(status, 'status')
|
||||
|
||||
url = expand_url(url)
|
||||
if url != '' and status in ['', None]:
|
||||
gw.set_cell(row, 'status', 'Archive in progress')
|
||||
|
||||
for archiver in active_archivers:
|
||||
logger.debug(f'Trying {archiver} on row {row}')
|
||||
url = expand_url(url)
|
||||
|
||||
# TODO: add support for multiple videos/images
|
||||
try:
|
||||
result = archiver.download(url, check_if_exists=True)
|
||||
except Exception as e:
|
||||
result = False
|
||||
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
|
||||
for archiver in active_archivers:
|
||||
logger.debug(f'Trying {archiver} on row {row}')
|
||||
|
||||
# TODO: add support for multiple videos/images
|
||||
# try:
|
||||
result = archiver.download(
|
||||
url, check_if_exists=True)
|
||||
# except Exception as e:
|
||||
# result = False
|
||||
# logger.error(
|
||||
# f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
|
||||
|
||||
if result:
|
||||
if result.status in ['success', 'already archived']:
|
||||
result.status = archiver.name + \
|
||||
": " + str(result.status)
|
||||
logger.success(
|
||||
f'{archiver} succeeded on row {row}')
|
||||
break
|
||||
logger.warning(
|
||||
f'{archiver} did not succeed on row {row}, final status: {result.status}')
|
||||
result.status = archiver.name + \
|
||||
": " + str(result.status)
|
||||
|
||||
if result:
|
||||
if result.status in ['success', 'already archived']:
|
||||
logger.success(f'{archiver} succeeded on row {row}')
|
||||
break
|
||||
logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
|
||||
update_sheet(gw, row, result)
|
||||
else:
|
||||
gw.set_cell(row, 'status', 'failed: no archiver')
|
||||
|
||||
if result:
|
||||
update_sheet(gw, row, result)
|
||||
else:
|
||||
gw.set_cell(row, 'status', 'failed: no archiver')
|
||||
driver.quit()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Automatically archive social media videos from a Google Sheets document')
|
||||
parser.add_argument('--sheet', action='store', dest='sheet')
|
||||
parser.add_argument('--header', action='store', dest='header', default=1, type=int)
|
||||
args = parser.parse_args()
|
||||
|
||||
logger.info(f'Opening document {args.sheet}')
|
||||
|
||||
mkdir_if_not_exists('tmp')
|
||||
process_sheet(args.sheet)
|
||||
process_sheet(args.sheet, header=args.header)
|
||||
shutil.rmtree('tmp')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
|
@ -3,7 +3,7 @@ from gspread import utils
|
|||
|
||||
class GWorksheet:
|
||||
COLUMN_NAMES = {
|
||||
'url': 'media url',
|
||||
'url': 'link',
|
||||
'archive': 'archive location',
|
||||
'date': 'archive date',
|
||||
'status': 'archive status',
|
||||
|
@ -11,12 +11,14 @@ class GWorksheet:
|
|||
'thumbnail_index': 'thumbnail index',
|
||||
'timestamp': 'upload timestamp',
|
||||
'title': 'upload title',
|
||||
'duration': 'duration'
|
||||
'duration': 'duration',
|
||||
'screenshot': 'screenshot',
|
||||
'hash': 'hash'
|
||||
}
|
||||
|
||||
def __init__(self, worksheet, columns=COLUMN_NAMES):
|
||||
def __init__(self, worksheet, columns=COLUMN_NAMES, header_row=1):
|
||||
self.wks = worksheet
|
||||
self.headers = [v.lower() for v in self.wks.row_values(1)]
|
||||
self.headers = [v.lower() for v in self.wks.row_values(header_row)]
|
||||
self.columns = columns
|
||||
|
||||
def _check_col_exists(self, col: str):
|
||||
|
@ -38,6 +40,9 @@ class GWorksheet:
|
|||
# row is 1-based
|
||||
return self.wks.row_values(row)
|
||||
|
||||
def get_values(self):
|
||||
return self.wks.get_values()
|
||||
|
||||
def get_cell(self, row, col: str):
|
||||
"""
|
||||
returns the cell value from (row, col),
|
||||
|
|
Ładowanie…
Reference in New Issue