kopia lustrzana https://github.com/bellingcat/auto-archiver
making code more resilient to exceptions
rodzic
644aa0811c
commit
9550cd509e
|
@ -54,8 +54,8 @@ class TelegramArchiver(Archiver):
|
|||
# extract duration from HTML
|
||||
duration = s.find_all('time')[0].contents[0]
|
||||
if ':' in duration:
|
||||
duration = float(duration.split(
|
||||
':')[0]) * 60 + float(duration.split(':')[1])
|
||||
duration = float(duration.split(':')[0]) * 60
|
||||
+ float(duration.split(':')[1])
|
||||
else:
|
||||
duration = float(duration)
|
||||
|
||||
|
|
|
@ -14,17 +14,18 @@ load_dotenv()
|
|||
|
||||
|
||||
def update_sheet(gw, row, result: archivers.ArchiveResult):
|
||||
update = []
|
||||
cell_updates = []
|
||||
row_values = gw.get_row(row)
|
||||
|
||||
def batch_if_valid(col, val, final_value=None):
|
||||
final_value = final_value or val
|
||||
if val and gw.col_exists(col) and gw.cell(row, col) == '':
|
||||
update.append((row, col, final_value))
|
||||
if val and gw.col_exists(col) and gw.get_cell(row_values, col) == '':
|
||||
cell_updates.append((row, col, final_value))
|
||||
|
||||
update.append((row, 'status', result.status))
|
||||
cell_updates.append((row, 'status', result.status))
|
||||
|
||||
batch_if_valid('archive', result.cdn_url)
|
||||
batch_if_valid('archive', True, datetime.datetime.now().isoformat())
|
||||
batch_if_valid('date', True, datetime.datetime.now().isoformat())
|
||||
batch_if_valid('thumbnail', result.thumbnail, f'=IMAGE("{result.thumbnail}")')
|
||||
batch_if_valid('thumbnail_index', result.thumbnail_index)
|
||||
batch_if_valid('title', result.title)
|
||||
|
@ -34,7 +35,18 @@ def update_sheet(gw, row, result: archivers.ArchiveResult):
|
|||
result.timestamp = datetime.datetime.fromtimestamp(result.timestamp).isoformat()
|
||||
batch_if_valid('timestamp', result.timestamp)
|
||||
|
||||
gw.update_batch(update)
|
||||
gw.batch_set_cell(cell_updates)
|
||||
|
||||
|
||||
def expand_url(url):
    """
    Expand a shortened ``https://t.co/`` link.

    URLs that do not contain ``https://t.co/`` are returned unchanged.
    Otherwise a GET request is issued and the URL reported by the response
    (``r.url``) is returned. If the request fails, the failure is logged and
    the URL that was passed in is returned, so a broken short link never
    aborts the caller.
    """
    if 'https://t.co/' not in url:
        return url

    try:
        r = requests.get(url)
    except Exception:
        # best-effort on purpose, but not a bare ``except:``: that would also
        # swallow KeyboardInterrupt / SystemExit and make the run unstoppable
        logger.error(f'Failed to expand url {url}')
        return url
    return r.url
|
||||
|
||||
|
||||
def process_sheet(sheet):
|
||||
|
@ -74,38 +86,34 @@ def process_sheet(sheet):
|
|||
]
|
||||
|
||||
# loop through rows in worksheet
|
||||
for i in range(2, gw.count_rows() + 1):
|
||||
row = gw.get_row(i)
|
||||
url = gw.cell(row, 'url')
|
||||
status = gw.cell(row, 'status')
|
||||
for row in range(2, gw.count_rows() + 1):
|
||||
url = gw.get_cell(row, 'url')
|
||||
status = gw.get_cell(row, 'status')
|
||||
if url != '' and status in ['', None]:
|
||||
gw.update(i, 'status', 'Archive in progress')
|
||||
gw.set_cell(row, 'status', 'Archive in progress')
|
||||
|
||||
# expand short URL links
|
||||
if 'https://t.co/' in url:
|
||||
r = requests.get(url)
|
||||
url = r.url
|
||||
url = expand_url(url)
|
||||
|
||||
for archiver in active_archivers:
|
||||
logger.debug(f'Trying {archiver} on row {i}')
|
||||
logger.debug(f'Trying {archiver} on row {row}')
|
||||
|
||||
# TODO: add support for multiple videos/images
|
||||
result = archiver.download(url, check_if_exists=True)
|
||||
try:
|
||||
result = archiver.download(url, check_if_exists=True)
|
||||
except Exception as e:
|
||||
result = False
|
||||
logger.error(f'Got unexpected error in row {row} with archiver {archiver} for url {url}: {e}')
|
||||
|
||||
if result:
|
||||
logger.success(f'{archiver} succeeded on row {i}')
|
||||
break
|
||||
if result.status in ['success', 'already archived']:
|
||||
logger.success(f'{archiver} succeeded on row {row}')
|
||||
break
|
||||
logger.warning(f'{archiver} did not succeed on row {row}, final status: {result.status}')
|
||||
|
||||
if result:
|
||||
update_sheet(gw, i, result)
|
||||
update_sheet(gw, row, result)
|
||||
else:
|
||||
gw.update(i, 'status', 'failed: no archiver')
|
||||
|
||||
# # except:
|
||||
# # if any unexpected errors occurred, log these into the Google Sheet
|
||||
# # t, value, traceback = sys.exc_info()
|
||||
|
||||
# # update_sheet(wks, i, str(
|
||||
# # value), {}, columns, v)
|
||||
gw.set_cell(row, 'status', 'failed: no archiver')
|
||||
|
||||
|
||||
def main():
|
||||
|
|
|
@ -19,20 +19,18 @@ class GWorksheet:
|
|||
self.headers = [v.lower() for v in self.wks.row_values(1)]
|
||||
self.columns = columns
|
||||
|
||||
def worksheet(self):
    """Returns the wrapped worksheet object stored in self.wks."""
    wrapped = self.wks
    return wrapped
|
||||
|
||||
def _check_col_exists(self, col: str):
    """
    Guard for column lookups: does nothing when ``col`` is a key of the
    configured ``self.columns`` mapping, raises an Exception otherwise.
    """
    if col in self.columns:
        return
    raise Exception(f'Column {col} is not in the configured column names: {self.columns.keys()}')
|
||||
|
||||
def _col_index(self, col: str):
    """
    Returns the 0-based position, inside self.headers, of the header that is
    configured for ``col`` in self.columns.
    self._check_col_exists(col) runs first and raises for unconfigured names;
    list.index raises ValueError when the header is not in self.headers.
    """
    self._check_col_exists(col)
    header_name = self.columns[col]
    return self.headers.index(header_name)
|
||||
|
||||
def col_exists(self, col: str):
    """
    Tells whether the header configured for ``col`` is present in self.headers.
    self._check_col_exists(col) runs first, so an unconfigured ``col`` raises
    instead of returning False.
    """
    self._check_col_exists(col)
    header_name = self.columns[col]
    return header_name in self.headers
|
||||
|
||||
def col_index(self, col: str):
    """
    Returns the 0-based position, inside self.headers, of the header that is
    configured for ``col`` in self.columns.
    self._check_col_exists(col) runs first and raises for unconfigured names.
    """
    self._check_col_exists(col)
    header_name = self.columns[col]
    return self.headers.index(header_name)
|
||||
|
||||
def count_rows(self):
    """Returns how many rows self.wks.get_values() yields."""
    all_values = self.wks.get_values()
    return len(all_values)
|
||||
|
||||
|
@ -40,30 +38,37 @@ class GWorksheet:
|
|||
# row is 1-based
|
||||
return self.wks.row_values(row)
|
||||
|
||||
def get_cell(self, row, col: str):
    """
    returns the cell value from (row, col),
    where row can be an index (1-based) OR list of values
    as received from self.get_row(row)

    an empty string is returned when the row holds fewer values than the
    column's position
    """
    # isinstance rather than ``type(row) == int`` so that int subclasses are
    # also treated as a row index instead of being indexed like a list
    if isinstance(row, int):
        row = self.get_row(row)

    col_index = self._col_index(col)
    if col_index >= len(row):
        return ''
    return row[col_index]
|
||||
|
||||
def set_cell(self, row: int, col: str, val):
    """
    writes val into the cell at (row, col) through self.wks.update_cell
    row is 1-based, col is a configured column name
    """
    # self._col_index(col) is 0-based, update_cell is given that index + 1
    sheet_col = self._col_index(col) + 1
    self.wks.update_cell(row, sheet_col, val)
|
||||
|
||||
def batch_set_cell(self, cell_updates):
    """
    receives a list of [(row:int, col:str, val)] and batch updates it, the parameters are the same as in the self.set_cell() method
    """
    payload = []
    for row, col, val in cell_updates:
        # one single-cell range per update, addressed via self.to_a1
        payload.append({
            'range': self.to_a1(row, self._col_index(col) + 1),
            'values': [[val]],
        })
    self.wks.batch_update(payload, value_input_option='USER_ENTERED')
|
||||
|
||||
def to_a1(self, row: int, col: int):
|
||||
# row, col are 1-based
|
||||
|
|
Ładowanie…
Reference in New Issue