Mirror of https://github.com/bellingcat/auto-archiver
extracted worksheet operations
parent e4603a9423
commit 2d145802b5
@@ -17,7 +17,7 @@ class TiktokArchiver(Archiver):
         try:
             info = tiktok_downloader.info_post(url)
             key = self.get_key(f'{info.id}.mp4')
-            cdn_url = self.get_cdn_url(key)
+            cdn_url = self.storage.get_cdn_url(key)
             filename = 'tmp/' + key

             if check_if_exists and self.storage.exists(key):
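Note: this one-line change moves CDN URL construction off the archiver and onto the storage backend, so every archiver resolves URLs through the same object it uploads to. A minimal sketch of the idea, assuming a DigitalOcean Spaces URL shape (the exact format lives in storages/ and is not shown in this diff):

    from dataclasses import dataclass

    @dataclass
    class S3Config:
        bucket: str
        region: str
        folder: str = ''

    class S3Storage:
        def __init__(self, config: S3Config):
            self.config = config

        def get_cdn_url(self, key: str) -> str:
            # hypothetical Spaces URL format, for illustration only
            c = self.config
            return f'https://{c.bucket}.{c.region}.cdn.digitaloceanspaces.com/{c.folder}{key}'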
auto_archive.py (194 changed lines)
@@ -1,7 +1,6 @@
 import os
 import datetime
 import argparse
-import math
 import requests
 import gspread
 from loguru import logger
@@ -9,96 +8,34 @@ from dotenv import load_dotenv

 import archivers
 from storages import S3Storage, S3Config
+from gworksheet import GWorksheet

 load_dotenv()


-def col_to_index(col):
-    col = list(col)
-    ndigits = len(col)
-    alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-    v = 0
-    i = ndigits - 1
-
-    for digit in col:
-        index = alphabet.find(digit)
-        v += (26 ** i) * index
-        i -= 1
-
-    return v - 1
-
-
-def index_to_col(index):
-    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-
-    if index > 25:
-        t = index
-        dig = 0
-        while t > 25:
-            t = math.floor(t / 26)
-            dig += 1
-        return alphabet[t - 1] + index_to_col(index - t * int(math.pow(26, dig)))
-    else:
-        return alphabet[index]
-
-
-def update_sheet(wks, row, result: archivers.ArchiveResult, columns, v):
+def update_sheet(gw, row, result: archivers.ArchiveResult):
     update = []

-    if columns['status'] is not None:
-        update += [{
-            'range': columns['status'] + str(row),
-            'values': [[result.status]]
-        }]
-
-    if result.cdn_url and columns['archive'] is not None and v[col_to_index(columns['archive'])] == '':
-        update += [{
-            'range': columns['archive'] + str(row),
-            'values': [[result.cdn_url]]
-        }]
-
-    if columns['date'] is not None and v[col_to_index(columns['date'])] == '':
-        update += [{
-            'range': columns['date'] + str(row),
-            'values': [[datetime.datetime.now().isoformat()]]
-        }]
-
-    if result.thumbnail and columns['thumbnail'] is not None and v[col_to_index(columns['thumbnail'])] == '':
-        update += [{
-            'range': columns['thumbnail'] + str(row),
-            'values': [['=IMAGE("' + result.thumbnail + '")']]
-        }]
-
-    if result.thumbnail_index and columns['thumbnail_index'] is not None and v[col_to_index(columns['thumbnail_index'])] == '':
-        update += [{
-            'range': columns['thumbnail_index'] + str(row),
-            'values': [[result.thumbnail_index]]
-        }]
-
-    if result.timestamp and columns['timestamp'] is not None and v[col_to_index(columns['timestamp'])] == '':
-        update += [{
-            'range': columns['timestamp'] + str(row),
-            'values': [[result.timestamp]] if type(result.timestamp) == str else [[datetime.datetime.fromtimestamp(result.timestamp).isoformat()]]
-        }]
-
-    if result.title and columns['title'] is not None and v[col_to_index(columns['title'])] == '':
-        update += [{
-            'range': columns['title'] + str(row),
-            'values': [[result.title]]
-        }]
-
-    if result.duration and columns['duration'] is not None and v[col_to_index(columns['duration'])] == '':
-        update += [{
-            'range': columns['duration'] + str(row),
-            'values': [[str(result.duration)]]
-        }]
-
-    wks.batch_update(update, value_input_option='USER_ENTERED')
-
-
-# def record_stream(url, s3_client, wks, i, columns, v):
-#     video_data, status = download_vid(url, s3_client)
-#     update_sheet(wks, i, status, video_data, columns, v)
+    def batch_if_valid(col, val, final_value=None):
+        final_value = final_value or val
+        if val and gw.col_exists(col) and gw.cell(row, col) == '':
+            update.append((row, col, final_value))
+
+    update.append((row, 'status', result.status))
+
+    batch_if_valid('archive', result.cdn_url)
+    batch_if_valid('date', True, datetime.datetime.now().isoformat())
+    batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
+    batch_if_valid('thumbnail_index', result.thumbnail_index)
+    batch_if_valid('title', result.title)
+    batch_if_valid('duration', result.duration, str(result.duration))
+
+    if result.timestamp and type(result.timestamp) != str:
+        result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
+    batch_if_valid('timestamp', result.timestamp)
+
+    gw.update_batch(update)


 def process_sheet(sheet):
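Note: the rewritten update_sheet replaces eight near-identical A1-range blocks with a closure that queues (row, col, value) tuples and writes them in one batched call. The guard means a cell is only written when there is a value, the column exists, and the cell is still empty. A self-contained sketch of the pattern (the row_values dict stands in for a sheet row and is not part of the codebase):

    import datetime

    row_values = {'status': '', 'archive': '', 'title': 'already filled'}
    update = []

    def batch_if_valid(col, val, final_value=None):
        # queue a write only for a non-empty value, a known column,
        # and a cell that has not been filled yet
        final_value = final_value or val
        if val and col in row_values and row_values[col] == '':
            update.append((col, final_value))

    batch_if_valid('archive', 'https://cdn.example.com/x.mp4')
    batch_if_valid('date', True, datetime.datetime.now().isoformat())  # unknown column: skipped
    batch_if_valid('title', 'new title')                               # cell already filled: skipped

    print(update)  # [('archive', 'https://cdn.example.com/x.mp4')]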
@@ -112,53 +49,19 @@ def process_sheet(sheet):
         secret=os.getenv('DO_SPACES_SECRET')
     )

-    # s3_client = boto3.client('s3',
-    #                          region_name=os.getenv('DO_SPACES_REGION'),
-    #                          endpoint_url='https://{}.digitaloceanspaces.com'.format(
-    #                              os.getenv('DO_SPACES_REGION')),
-    #                          aws_access_key_id=os.getenv('DO_SPACES_KEY'),
-    #                          aws_secret_access_key=os.getenv('DO_SPACES_SECRET'))
-
     # loop through worksheets to check
     for ii, wks in enumerate(sh.worksheets()):
         logger.info(f'Opening worksheet {ii}: "{wks.title}"')
-        values = wks.get_all_values()
-
-        headers = [v.lower() for v in values[0]]
-        columns = {}
-
-        columns['url'] = index_to_col(headers.index(
-            'media url')) if 'media url' in headers else index_to_col(headers.index(
-                'source url')) if 'source url' in headers else None
-
-        if columns['url'] is None:
+        gw = GWorksheet(wks)
+
+        if not gw.col_exists("url"):
             logger.warning(f'No "Media URL" column found, skipping worksheet {wks.title}')
             continue

-        url_index = col_to_index(columns['url'])
-
-        columns['archive'] = index_to_col(headers.index(
-            'archive location')) if 'archive location' in headers else None
-        columns['date'] = index_to_col(headers.index(
-            'archive date')) if 'archive date' in headers else None
-        columns['status'] = index_to_col(headers.index(
-            'archive status')) if 'archive status' in headers else None
-
-        if columns['status'] is None:
+        if not gw.col_exists("status"):
             logger.warning("No 'Archive status' column found, skipping")
             continue

-        columns['thumbnail'] = index_to_col(headers.index(
-            'thumbnail')) if 'thumbnail' in headers else None
-        columns['thumbnail_index'] = index_to_col(headers.index(
-            'thumbnail index')) if 'thumbnail index' in headers else None
-        columns['timestamp'] = index_to_col(headers.index(
-            'upload timestamp')) if 'upload timestamp' in headers else None
-        columns['title'] = index_to_col(headers.index(
-            'upload title')) if 'upload title' in headers else None
-        columns['duration'] = index_to_col(headers.index(
-            'duration')) if 'duration' in headers else None
-
         # archives will be in a folder 'doc_name/worksheet_name'
         s3_config.folder = f'{sheet}/{wks.title}/'
         s3_client = S3Storage(s3_config)
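Note: header detection now goes through GWorksheet instead of index arithmetic over values[0]; col_exists('url') looks for the 'media url' header that the warning above complains about. A quick usage sketch, assuming GWorksheet from the new gworksheet.py at the bottom of this commit; StubWorksheet is a hypothetical stand-in for a gspread worksheet:

    class StubWorksheet:
        # just enough of the gspread worksheet interface for GWorksheet
        def __init__(self, rows):
            self._rows = rows

        def row_values(self, i):  # 1-based, like gspread
            return self._rows[i - 1]

        def get_values(self):
            return self._rows

    wks = StubWorksheet([
        ['Media URL', 'Archive status'],
        ['https://example.com/video', ''],
    ])

    gw = GWorksheet(wks)
    print(gw.col_exists('url'))        # True  ('media url' found in headers)
    print(gw.col_exists('thumbnail'))  # False (no such header)
    print(gw.cell(2, 'url'))           # https://example.com/video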
@@ -172,47 +75,42 @@ def process_sheet(sheet):
         ]

         # loop through rows in worksheet
-        for i in range(2, len(values) + 1):
-            v = values[i - 1]
-            url = v[url_index]
-
-            if url != "" and v[col_to_index(columns['status'])] == "":
-                latest_val = wks.acell(columns['status'] + str(i)).value
-
-                # check so we don't step on each others' toes
-                if latest_val == '' or latest_val is None:
-                    wks.update(columns['status'] + str(i), 'Archive in progress')
-
-                    # expand short URL links
-                    if 'https://t.co/' in url:
-                        r = requests.get(url)
-                        url = r.url
-
-                    for archiver in active_archivers:
-                        logger.debug(f"Trying {archiver} on row {i}")
-
-                        result = archiver.download(url, check_if_exists=True)
-
-                        if result:
-                            logger.success(f"{archiver} succeeded on row {i}")
-                            break
-
-                    if result:
-                        update_sheet(wks, i, result, columns, v)
-                    else:
-                        wks.update(columns['status'] + str(i), 'failed: no archiver')
-
-            # except:
-            #     if any unexpected errors occured, log these into the Google Sheet
-            #     t, value, traceback = sys.exc_info()
-
-            #     update_sheet(wks, i, str(
-            #         value), {}, columns, v)
+        for i in range(2, gw.count_rows() + 1):
+            row = gw.get_row(i)
+            url = gw.cell(row, 'url')
+            status = gw.cell(row, 'status')
+
+            if url != '' and status in ['', None]:
+                gw.update(i, 'status', 'Archive in progress')
+
+                # expand short URL links
+                if 'https://t.co/' in url:
+                    r = requests.get(url)
+                    url = r.url
+
+                for archiver in active_archivers:
+                    logger.debug(f'Trying {archiver} on row {i}')
+                    result = archiver.download(url, check_if_exists=True)
+
+                    if result:
+                        logger.success(f'{archiver} succeeded on row {i}')
+                        break
+
+                if result:
+                    update_sheet(gw, i, result)
+                else:
+                    gw.update(i, 'status', 'failed: no archiver')
+
+            # # except:
+            # #     if any unexpected errors occured, log these into the Google Sheet
+            # #     t, value, traceback = sys.exc_info()
+
+            # #     update_sheet(wks, i, str(
+            # #         value), {}, columns, v)


 def main():
     parser = argparse.ArgumentParser(
-        description="Automatically archive social media videos from a Google Sheet")
+        description="Automatically archive social media videos from a Google Sheets document")
     parser.add_argument("--sheet", action="store", dest="sheet")
     args = parser.parse_args()
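Note on the short-link expansion kept in the new code: requests follows redirects by default, and Response.url holds the final resolved address, so one GET is enough to expand a t.co link. For example (httpbin used purely for illustration):

    import requests

    # the 302 from httpbin is followed automatically; r.url is the target
    r = requests.get('https://httpbin.org/redirect-to?url=https://example.com/')
    print(r.url)  # https://example.com/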
gworksheet.py (new file)
@@ -0,0 +1,97 @@
+from gspread import utils
+
+
+class GWorksheet:
+    COLUMN_NAMES = {
+        'url': 'media url',
+        'archive': 'archive location',
+        'date': 'archive date',
+        'status': 'archive status',
+        'thumbnail': 'thumbnail',
+        'thumbnail_index': 'thumbnail index',
+        'timestamp': 'upload timestamp',
+        'title': 'upload title',
+        'duration': 'duration'
+    }
+
+    def __init__(self, worksheet, columns=COLUMN_NAMES):
+        self.wks = worksheet
+        self.headers = [v.lower() for v in self.wks.row_values(1)]
+        self.columns = columns
+
+    def worksheet(self): return self.wks
+
+    def _check_col_exists(self, col: str):
+        if col not in self.columns:
+            raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
+
+    def col_exists(self, col: str):
+        self._check_col_exists(col)
+        return self.columns[col] in self.headers
+
+    def col_index(self, col: str):
+        self._check_col_exists(col)
+        return self.headers.index(self.columns[col])
+
+    def count_rows(self):
+        return len(self.wks.get_values())
+
+    def get_row(self, row: int):
+        # row is 1-based
+        return self.wks.row_values(row)
+
+    def cell(self, row, col: str):
+        # row can be index (1-based) or list of values
+        if type(row) == int:
+            row = self.get_row(row)
+
+        col_index = self.col_index(col)
+        if col_index >= len(row):
+            return ''
+        return row[col_index]
+
+    def update(self, row: int, col: str, val):
+        # row is 1-based
+        col_index = self.col_index(col) + 1
+        self.wks.update_cell(row, col_index, val)
+
+    def update_batch(self, updates):
+        updates = [
+            {
+                'range': self.to_a1(row, self.col_index(col) + 1),
+                'values': [[val]]
+            }
+            for row, col, val in updates
+        ]
+        self.wks.batch_update(updates, value_input_option='USER_ENTERED')
+
+    def to_a1(self, row: int, col: int):
+        # row, col are 1-based
+        return utils.rowcol_to_a1(row, col)
+
+    # def index_to_col(self, index):
+    #     alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+
+    #     if index > 25:
+    #         t = index
+    #         dig = 0
+    #         while t > 25:
+    #             t = math.floor(t / 26)
+    #             dig += 1
+    #         return alphabet[t - 1] + self.index_to_col(index - t * int(math.pow(26, dig)))
+    #     else:
+    #         return alphabet[index]
+
+    # def col_to_index(self, col):
+    #     col = list(col)
+    #     ndigits = len(col)
+    #     alphabet = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    #     v = 0
+    #     i = ndigits - 1
+
+    #     for digit in col:
+    #         index = alphabet.find(digit)
+    #         v += (26 ** i) * index
+    #         i -= 1
+
+    #     return v - 1