kopia lustrzana https://github.com/bellingcat/auto-archiver
retry mechanism
rodzic
6dcb59fea6
commit
067e6d8954
|
@ -1,7 +1,8 @@
|
|||
import os, datetime, shutil, hashlib, time, requests
|
||||
import os, datetime, shutil, hashlib, time, requests, re
|
||||
from dataclasses import dataclass
|
||||
from abc import ABC, abstractmethod
|
||||
from urllib.parse import urlparse
|
||||
from random import randrange
|
||||
|
||||
import ffmpeg
|
||||
from loguru import logger
|
||||
|
@ -27,6 +28,7 @@ class ArchiveResult:
|
|||
|
||||
class Archiver(ABC):
|
||||
name = "default"
|
||||
retry_regex = r"retrying at (\d+)$"
|
||||
|
||||
def __init__(self, storage: Storage, driver):
|
||||
self.storage = storage
|
||||
|
@ -95,7 +97,7 @@ class Archiver(ABC):
|
|||
key = self.get_key(path.replace("/", "_"))
|
||||
if '.' not in path:
|
||||
key += '.jpg'
|
||||
|
||||
|
||||
filename = os.path.join(Storage.TMP_FOLDER, key)
|
||||
|
||||
d = requests.get(media_url, headers=headers)
|
||||
|
@ -226,3 +228,38 @@ class Archiver(ABC):
|
|||
thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
|
||||
|
||||
return (key_thumb, thumb_index_cdn_url)
|
||||
|
||||
def signal_retry_in(self, min_seconds=1800, max_seconds=7200):
    """
    Build an ArchiveResult whose status asks for a retry at a later time.

    The retry moment is the current epoch time plus a random delay drawn
    from [min_seconds, max_seconds) (upper bound exclusive, as with
    randrange). The status text is 'retrying at <epoch seconds>', the
    format matched by Archiver.retry_regex.

    If max_seconds <= min_seconds the delay is exactly min_seconds.

    :param min_seconds: smallest delay, in seconds, before the retry
    :param max_seconds: exclusive upper bound of the delay, in seconds
    :return: ArchiveResult carrying the retry status message
    """
    # randrange raises ValueError on an empty range, so a degenerate
    # interval falls back to a fixed delay instead of crashing the archiver
    if max_seconds > min_seconds:
        delay = randrange(min_seconds, max_seconds)
    else:
        delay = min_seconds
    now = datetime.datetime.now().timestamp()
    retry_at = int(now + delay)
    logger.debug(f"signaling {retry_at=}")
    return ArchiveResult(status=f'retrying at {retry_at}')
|
||||
|
||||
@staticmethod
def is_retry(status):
    """
    Tell whether a status string carries a retry marker.

    :param status: status text to inspect
    :return: True if status matches Archiver.retry_regex, False otherwise
        (including when status is not a string, e.g. None)
    """
    # re.search raises TypeError on non-string input; such a status
    # cannot hold a retry marker, so report False instead
    if not isinstance(status, str):
        return False
    return re.search(Archiver.retry_regex, status) is not None
|
||||
|
||||
@staticmethod
def should_retry_from_status(status):
    """
    Decide whether a status carrying a retry marker is due for another attempt.

    The status is matched against Archiver.retry_regex, i.e. the message
    produced by signal_retry_in, and the captured epoch timestamp is
    compared with the current time.

    :param status: status text to inspect
    :return: True if a retry marker is present and its timestamp is now or
        in the past; False if the retry time has not been reached, if
        there is no retry marker, or if status is not a string
    """
    # an empty/None status cannot hold a retry marker; guard so that
    # re.search is never handed a non-string
    if not isinstance(status, str):
        return False
    match = re.search(Archiver.retry_regex, status)
    if match is None:
        return False
    retry_at = int(match.group(1))
    now = datetime.datetime.now().timestamp()
    should_retry = now >= retry_at
    logger.debug(f"{should_retry=} as {now=} >= {retry_at=}")
    return should_retry
|
||||
|
||||
@staticmethod
def remove_retry(status):
    """
    Replace the retry marker in a status with a terminal failure message.

    The text matched by Archiver.retry_regex is substituted with
    'failed: too many retries'; a status without a retry marker is
    returned unchanged.

    :param status: status text that may contain a retry marker
    :return: the status with the retry marker replaced
    """
    # count is given by keyword: passing count/flags to re.sub
    # positionally is deprecated since Python 3.13 (0 = replace all)
    new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, count=0)
    logger.debug(f"removing retry message at {status=}, got {new_status=}")
    return new_status
|
||||
|
||||
|
|
|
@ -63,7 +63,8 @@ class WaybackArchiver(Archiver):
|
|||
|
||||
status_json = status_r.json()
|
||||
if status_json['status'] != 'success':
|
||||
# TODO: if "please try again" in str(status_json).lower() then this can be retried in the future
|
||||
if "please try again" in str(status_json).lower():
|
||||
return self.signal_retry_in()
|
||||
return ArchiveResult(status='Internet Archive failed: ' + str(status_json))
|
||||
|
||||
archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
import os, datetime, shutil, traceback
|
||||
import os, datetime, shutil, traceback, random
|
||||
|
||||
from loguru import logger
|
||||
from slugify import slugify
|
||||
|
||||
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
|
||||
from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver
|
||||
from utils import GWorksheet, mkdir_if_not_exists, expand_url
|
||||
from configs import Config
|
||||
from storages import Storage
|
||||
|
||||
random.seed()
|
||||
|
||||
|
||||
def update_sheet(gw, row, result: ArchiveResult):
|
||||
cell_updates = []
|
||||
|
@ -72,7 +74,10 @@ def process_sheet(c: Config):
|
|||
original_status = gw.get_cell(row, 'status')
|
||||
status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
|
||||
|
||||
if url == '' or status not in ['', None]: continue
|
||||
is_retry = False
|
||||
if url == '' or status not in ['', None]:
|
||||
is_retry = Archiver.should_retry_from_status(status)
|
||||
if not is_retry: continue
|
||||
|
||||
# All checks done - archival process starts here
|
||||
gw.set_cell(row, 'status', 'Archive in progress')
|
||||
|
@ -85,9 +90,9 @@ def process_sheet(c: Config):
|
|||
# order matters, first to succeed excludes remaining
|
||||
active_archivers = [
|
||||
TelethonArchiver(storage, c.webdriver, c.telegram_config),
|
||||
TelegramArchiver(storage, c.webdriver),
|
||||
TiktokArchiver(storage, c.webdriver),
|
||||
YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
|
||||
TelegramArchiver(storage, c.webdriver),
|
||||
TwitterArchiver(storage, c.webdriver),
|
||||
WaybackArchiver(storage, c.webdriver, c.wayback_config)
|
||||
]
|
||||
|
@ -113,6 +118,9 @@ def process_sheet(c: Config):
|
|||
if success:
|
||||
logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
|
||||
break
|
||||
# only 1 retry possible for now
|
||||
if is_retry and Archiver.is_retry(result.status):
|
||||
result.status = Archiver.remove_retry(result.status)
|
||||
logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')
|
||||
|
||||
if result:
|
||||
|
|
Ładowanie…
Reference in New Issue