retry mechanism

2022-06-08 13:39:52 +02:00 · 2022-06-08 13:39:52 +02:00 · 067e6d8954
commit 067e6d8954
--- a/archivers/base_archiver.py
+++ b/archivers/base_archiver.py
@ -1,7 +1,8 @@
-import os, datetime, shutil, hashlib, time, requests
+import os, datetime, shutil, hashlib, time, requests, re
 from dataclasses import dataclass
 from abc import ABC, abstractmethod
 from urllib.parse import urlparse
+from random import randrange

 import ffmpeg
 from loguru import logger
@ -27,6 +28,7 @@ class ArchiveResult:

 class Archiver(ABC):
    name = "default"
+    retry_regex = r"retrying at (\d+)$"

    def __init__(self, storage: Storage, driver):
        self.storage = storage
@ -95,7 +97,7 @@ class Archiver(ABC):
            key = self.get_key(path.replace("/", "_"))
            if '.' not in path:
                key += '.jpg'
-                
+
            filename = os.path.join(Storage.TMP_FOLDER, key)

            d = requests.get(media_url, headers=headers)
@ -226,3 +228,38 @@ class Archiver(ABC):
        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)

        return (key_thumb, thumb_index_cdn_url)
+
+    def signal_retry_in(self, min_seconds=1800, max_seconds=7200):
+        """
+        Build an ArchiveResult whose status asks for a retry at a random
+        future time.
+
+        The delay is drawn from [min_seconds, max_seconds]; both bounds are
+        inclusive so that min_seconds == max_seconds yields a fixed delay
+        instead of raising ValueError on an empty range.
+
+        The status text is "retrying at <epoch seconds>", which is the format
+        matched by Archiver.retry_regex.
+        """
+        now = datetime.datetime.now().timestamp()
+        # +1 makes the upper bound inclusive and keeps randrange valid when
+        # both bounds are equal
+        delay = randrange(min_seconds, max_seconds + 1)
+        retry_at = int(now + delay)
+        logger.debug(f"signaling {retry_at=}")
+        return ArchiveResult(status=f'retrying at {retry_at}')
+
+    @staticmethod
+    def is_retry(status):
+        """
+        Return True when status ends with the retry marker described by
+        Archiver.retry_regex ("retrying at <digits>"), False otherwise.
+
+        Non-string values (e.g. None from an empty cell) are never a retry.
+        Declared as a staticmethod so it can be called on the class or on an
+        instance alike.
+        """
+        if not isinstance(status, str):
+            return False
+        return re.search(Archiver.retry_regex, status) is not None
+
+    @staticmethod
+    def should_retry_from_status(status):
+        """
+        Check status against the message produced by signal_retry_in.
+
+        Returns True only if status carries a retry marker and the encoded
+        timestamp has already been reached; returns False when there is no
+        marker, when it is still too early, or when status is not a string
+        (e.g. None), which would otherwise make re.search raise TypeError.
+        """
+        if not isinstance(status, str):
+            return False
+        match = re.search(Archiver.retry_regex, status)
+        if match is None:
+            return False
+        # the captured group is the epoch-seconds value written by signal_retry_in
+        retry_at = int(match.group(1))
+        now = datetime.datetime.now().timestamp()
+        should_retry = now >= retry_at
+        logger.debug(f"{should_retry=} as {now=} >= {retry_at=}")
+        return should_retry
+
+    @staticmethod
+    def remove_retry(status):
+        """
+        Transform a retry status into a terminal failure status.
+
+        Every match of Archiver.retry_regex in status is replaced with
+        "failed: too many retries"; text without a retry marker is returned
+        unchanged.
+        """
+        # count is left at its default (replace all); passing it positionally
+        # is deprecated in recent Python versions
+        new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status)
+        logger.debug(f"removing retry message at {status=}, got {new_status=}")
+        return new_status
+
--- a/archivers/wayback_archiver.py
+++ b/archivers/wayback_archiver.py
@ -63,7 +63,8 @@ class WaybackArchiver(Archiver):

        status_json = status_r.json()
        if status_json['status'] != 'success':
-            # TODO: if "please try again" in str(status_json).lower() then this can be retried in the future
+            if "please try again" in str(status_json).lower():
+                return self.signal_retry_in()
            return ArchiveResult(status='Internet Archive failed: ' + str(status_json))

        archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"
--- a/auto_archive.py
+++ b/auto_archive.py
@ -1,13 +1,15 @@
-import os, datetime, shutil, traceback
+import os, datetime, shutil, traceback, random

 from loguru import logger
 from slugify import slugify

-from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult
+from archivers import TelethonArchiver, TelegramArchiver, TiktokArchiver, YoutubeDLArchiver, TwitterArchiver, WaybackArchiver, ArchiveResult, Archiver
 from utils import GWorksheet, mkdir_if_not_exists, expand_url
 from configs import Config
 from storages import Storage

+random.seed()
+

 def update_sheet(gw, row, result: ArchiveResult):
    cell_updates = []
@ -72,7 +74,10 @@ def process_sheet(c: Config):
            original_status = gw.get_cell(row, 'status')
            status = gw.get_cell(row, 'status', fresh=original_status in ['', None] and url != '')

-            if url == '' or status not in ['', None]: continue
+            is_retry = False
+            if url == '' or status not in ['', None]:
+                is_retry = Archiver.should_retry_from_status(status)
+                if not is_retry: continue

            # All checks done - archival process starts here
            gw.set_cell(row, 'status', 'Archive in progress')
@ -85,9 +90,9 @@ def process_sheet(c: Config):
            # order matters, first to succeed excludes remaining
            active_archivers = [
                TelethonArchiver(storage, c.webdriver, c.telegram_config),
-                TelegramArchiver(storage, c.webdriver),
                TiktokArchiver(storage, c.webdriver),
                YoutubeDLArchiver(storage, c.webdriver, c.facebook_cookie),
+                TelegramArchiver(storage, c.webdriver),
                TwitterArchiver(storage, c.webdriver),
                WaybackArchiver(storage, c.webdriver, c.wayback_config)
            ]
@ -113,6 +118,9 @@ def process_sheet(c: Config):
                    if success:
                        logger.success(f'{archiver.name} succeeded on {row=}, {url=}')
                        break
+                    # only 1 retry possible for now
+                    if is_retry and Archiver.is_retry(result.status):
+                        result.status = Archiver.remove_retry(result.status)
                    logger.warning(f'{archiver.name} did not succeed on {row=}, final status: {result.status}')

            if result: