auto-archiver/archivers/wayback_archiver.py

import time, requests

from loguru import logger
from bs4 import BeautifulSoup

from storages import Storage
from .base_archiver import Archiver, ArchiveResult
from configs import WaybackConfig


class WaybackArchiver(Archiver):
    """
    This archiver could implement a check_if_exists by going to "https://web.archive.org/web/{url}"
    but that might not be desirable since the webpage might have been archived a long time ago and thus have changed
    """
    name = "wayback"

    def __init__(self, storage: Storage, driver, config: WaybackConfig):
        super(WaybackArchiver, self).__init__(storage, driver)
        self.config = config
        self.seen_urls = {}

    def download(self, url, check_if_exists=False):
        if self.config is None:
            logger.error('Missing Wayback config')
            return False
        if check_if_exists:
            if url in self.seen_urls: return self.seen_urls[url]

        screenshot = self.get_screenshot(url)
        logger.debug(f"POSTing {url=} to web.archive.org")
        ia_headers = {
            "Accept": "application/json",
            "Authorization": f"LOW {self.config.key}:{self.config.secret}"
        }
        r = requests.post('https://web.archive.org/save/', headers=ia_headers, data={'url': url})

        if r.status_code != 200:
            logger.warning(f"Internet archive failed with status of {r.status_code}")
            return ArchiveResult(status="Internet archive failed", screenshot=screenshot)

        if 'job_id' not in r.json() and 'message' in r.json():
            return self.custom_retry(r.json(), screenshot=screenshot)

        job_id = r.json()['job_id']
        logger.debug(f"GETting status for {job_id=} on {url=}")
        status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
        retries = 0

        # TODO: make the job queue parallel -> consider propagation of results back to sheet though
        # wait 90-120 seconds for the archive job to finish
        while (status_r.status_code != 200 or status_r.json()['status'] == 'pending') and retries < 30:
            time.sleep(3)
            try:
                logger.debug(f"GETting status for {job_id=} on {url=} [{retries=}]")
                status_r = requests.get(f'https://web.archive.org/save/status/{job_id}', headers=ia_headers)
            except:
                time.sleep(1)
            retries += 1

        if status_r.status_code != 200:
            return ArchiveResult(status=f"Internet archive failed: check https://web.archive.org/save/status/{job_id}", screenshot=screenshot)

        status_json = status_r.json()
        if status_json['status'] != 'success':
            return self.custom_retry(status_json, screenshot=screenshot)

        archive_url = f"https://web.archive.org/web/{status_json['timestamp']}/{status_json['original_url']}"

        try:
            req = requests.get(archive_url)
            parsed = BeautifulSoup(req.content, 'html.parser')
            title = parsed.find_all('title')[0].text
            if title == 'Wayback Machine':
                title = 'Could not get title'
        except:
            title = "Could not get title"
        screenshot = self.get_screenshot(url)
        self.seen_urls[url] = ArchiveResult(status='success', cdn_url=archive_url, title=title, screenshot=screenshot)
        return self.seen_urls[url]

    def custom_retry(self, json_data, **kwargs):
        logger.warning(f"Internet archive failed json \n {json_data}")
        if "please try again" in str(json_data).lower():
            return self.signal_retry_in(**kwargs)
        if "this host has been already captured" in str(json_data).lower():
            return self.signal_retry_in(**kwargs, min_seconds=86400, max_seconds=129600)  # 24h to 36h later
        return ArchiveResult(status=f"Internet archive failed: {json_data}", **kwargs)