retry mechanism

pull/33/head
msramalho 2022-06-08 13:39:52 +02:00
parent 6dcb59fea6
commit 067e6d8954
3 changed files with 53 additions and 7 deletions

View file

@@ -1,7 +1,8 @@
-import os, datetime, shutil, hashlib, time, requests
+import os, datetime, shutil, hashlib, time, requests, re
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse
+from random import randrange
import ffmpeg
from loguru import logger
@@ -27,6 +28,7 @@ class ArchiveResult:
class Archiver(ABC):
    name = "default"
+    retry_regex = r"retrying at (\d+)$"

    def __init__(self, storage: Storage, driver):
        self.storage = storage
@@ -95,7 +97,7 @@ class Archiver(ABC):
        key = self.get_key(path.replace("/", "_"))
        if '.' not in path:
            key += '.jpg'

        filename = os.path.join(Storage.TMP_FOLDER, key)

        d = requests.get(media_url, headers=headers)
@@ -226,3 +228,38 @@ class Archiver(ABC):
        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
        return (key_thumb, thumb_index_cdn_url)
+
+    def signal_retry_in(self, min_seconds=1800, max_seconds=7200):
+        """
+        sets the status to retry at a random time between min_seconds and max_seconds from now
+        """
+        now = datetime.datetime.now().timestamp()
+        retry_at = int(now + randrange(min_seconds, max_seconds))
+        logger.debug(f"signaling {retry_at=}")
+        return ArchiveResult(status=f'retrying at {retry_at}')
+
+    def is_retry(status):
+        return re.search(Archiver.retry_regex, status) is not None
+
+    def should_retry_from_status(status):
+        """
+        checks the status against the message set by signal_retry_in;
+        returns True if enough time has elapsed, False otherwise
+        """
+        match = re.search(Archiver.retry_regex, status)
+        if match:
+            retry_at = int(match.group(1))
+            now = datetime.datetime.now().timestamp()
+            should_retry = now >= retry_at
+            logger.debug(f"{should_retry=} as {now=} >= {retry_at=}")
+            return should_retry
+        return False
+
+    def remove_retry(status):
+        """
+        transforms the status from a retry message into a terminal failure
+        """
+        new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0)
+        logger.debug(f"removing retry message at {status=}, got {new_status=}")
+        return new_status
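
These helpers keep all retry state inside the row's status string itself: signal_retry_in encodes a deadline as "retrying at <timestamp>", and should_retry_from_status decodes it later. A minimal standalone sketch of that round-trip, with illustrative names outside the Archiver class:

import re, datetime
from random import randrange

RETRY_REGEX = r"retrying at (\d+)$"  # same pattern as Archiver.retry_regex

def signal_retry_in(min_seconds=1800, max_seconds=7200):
    # encode a future timestamp into the status string
    now = datetime.datetime.now().timestamp()
    return f"retrying at {int(now + randrange(min_seconds, max_seconds))}"

def should_retry_from_status(status):
    # decode the timestamp and check whether it has already passed
    match = re.search(RETRY_REGEX, status)
    return bool(match) and datetime.datetime.now().timestamp() >= int(match.group(1))

status = signal_retry_in(60, 120)          # e.g. "retrying at 1654687292"
print(should_retry_from_status(status))    # False until the encoded time has passed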

View file

@@ -63,7 +63,8 @@ class WaybackArchiver(Archiver):
        status_json = status_r.json()

        if status_json['status'] != 'success':
-            # TODO: if "please try again" in str(status_json).lower() then this can be retried in the future
+            if "please try again" in str(status_json).lower():
+                return self.signal_retry_in()
            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))

        archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"

View file

@@ -1,13 +1,15 @@
-import os, datetime, shutil, traceback
+import os, datetime, shutil, traceback, random
from loguru import logger
from slugify import slugify
-from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
+from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver
from utils import GWorksheet, mkdir_if_not_exists, expand_url
from configs import Config
from storages import Storage

+random.seed()

def update_sheet(gw, row, result: ArchiveResult):
    cell_updates = []
@@ -72,7 +74,10 @@ def process_sheet(c: Config):
            original_status = gw.get_cell(row, 'status')
            status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')
-            if url == '' or status not in ['', None]: continue
+            is_retry = False
+            if url == '' or status not in ['', None]:
+                is_retry = Archiver.should_retry_from_status(status)
+                if not is_retry: continue

            # All checks done - archival process starts here
            gw.set_cell(row, 'status', 'Archive in progress')
@@ -85,9 +90,9 @@ def process_sheet(c: Config):
            # order matters, first to succeed excludes remaining
            active_archivers = [
                TelethonArchiver(storage, c.webdriver, c.telegram_config),
                TelegramArchiver(storage, c.webdriver),
                TiktokArchiver(storage, c.webdriver),
                YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
                TelegramArchiver(storage, c.webdriver),
                TwitterArchiver(storage, c.webdriver),
                WaybackArchiver(storage, c.webdriver, c.wayback_config)
            ]
@@ -113,6 +118,9 @@ def process_sheet(c: Config):
                if success:
                    logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
                    break
+                # only 1 retry possible for now
+                if is_retry and Archiver.is_retry(result.status):
+                    result.status = Archiver.remove_retry(result.status)
                logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')

            if result:
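
Taken together, the sheet loop gives each retry-flagged row exactly one second chance: should_retry_from_status lets it back in once the encoded deadline passes, and remove_retry turns a second retry signal into a terminal failure. A small walk-through of the new helpers (assumes the repo's dependencies are installed so the archivers package imports):

import time
from archivers import Archiver

# pretend an earlier run wrote this status to the sheet, due one second from now
status = f"retrying at {int(time.time()) + 1}"

print(Archiver.is_retry(status))                  # True: matches retry_regex
print(Archiver.should_retry_from_status(status))  # False: the deadline has not passed yet
time.sleep(2)
print(Archiver.should_retry_from_status(status))  # True: the row would be re-processed
print(Archiver.remove_retry(status))              # "failed: too many retries" (only 1 retry for now)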