auto-archiver/archivers/base_archiver.py


import os, datetime, shutil, hashlib, time, requests, re, mimetypes
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from random import randrange

import ffmpeg
from loguru import logger
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from slugify import slugify

from storages import Storage
from utils import mkdir_if_not_exists


@dataclass
class ArchiveResult:
    status: str
    cdn_url: str = None
    thumbnail: str = None
    thumbnail_index: str = None
    duration: float = None
    title: str = None
    timestamp: datetime.datetime = None
    screenshot: str = None
    hash: str = None
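
# Illustrative example (all values made up): a successful archive might produce
#   ArchiveResult(status="success", cdn_url="https://cdn.example.com/x.html",
#                 thumbnail="https://cdn.example.com/x.jpg", hash="0f4e...")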


class Archiver(ABC):
    name = "default"
    retry_regex = r"retrying at (\d+)$"
    def __init__(self, storage: Storage, driver):
        self.storage = storage
        self.driver = driver

    def __str__(self):
        return self.__class__.__name__

    def __repr__(self):
        return self.__str__()

    @abstractmethod
    def download(self, url, check_if_exists=False): pass
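
    # Illustrative sketch (not part of this file): a minimal subclass only has
    # to implement download(), e.g.
    #
    #   class ExampleArchiver(Archiver):
    #       name = "example"
    #
    #       def download(self, url, check_if_exists=False):
    #           screenshot = self.get_screenshot(url)
    #           return ArchiveResult(status="success", screenshot=screenshot)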

    def get_netloc(self, url):
        return urlparse(url).netloc

    # generates the html page, eg SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
    def generate_media_page_html(self, url, urls_info: list, object, thumbnail=None):
        """
        Generates an index.html page where each entry of @urls_info is displayed
        """
        page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
<h2>Archived media from {self.name}</h2>
<h3><a href="{url}">{url}</a></h3><ul>'''

        for url_info in urls_info:
            mime_global = self._guess_file_type(url_info["key"])

            preview = ""
            if mime_global == "image":
                preview = f'<img src="{url_info["cdn_url"]}" style="max-height:200px;max-width:400px;"></img>'
            elif mime_global == "video":
                preview = f'<video src="{url_info["cdn_url"]}" controls style="max-height:400px;max-width:400px;"></video>'
            page += f'''<li><a href="{url_info['cdn_url']}">{preview}{url_info['key']}</a>: {url_info['hash']}</li>'''

        page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
        page += "</body></html>"

        page_key = self.get_html_key(url)
        page_filename = os.path.join(Storage.TMP_FOLDER, page_key)

        with open(page_filename, "w") as f:
            f.write(page)

        page_hash = self.get_hash(page_filename)

        self.storage.upload(page_filename, page_key, extra_args={
            'ACL': 'public-read', 'ContentType': 'text/html'})
        page_cdn = self.storage.get_cdn_url(page_key)
        return (page_cdn, page_hash, thumbnail)
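
    # Each entry in @urls_info is expected to carry 'cdn_url', 'key' and 'hash',
    # as built by generate_media_page below, e.g. (values illustrative):
    #   {'cdn_url': 'https://cdn.example.com/x.jpg', 'key': 'twitter_x.jpg', 'hash': 'ab12...'}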

    def _guess_file_type(self, path: str):
        """
        Receives a URL or filename and returns the top-level mime type,
        like 'image' or 'video'; see
        https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
        """
        mime = mimetypes.guess_type(path)[0]
        if mime is not None:
            return mime.split("/")[0]
        return ""

    # eg images in a tweet are fetched and saved to cloud storage
    def generate_media_page(self, urls, url, object):
        """
        For a list of media urls, fetch them, upload them
        and call self.generate_media_page_html with them
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }

        thumbnail = None
        uploaded_media = []
        for media_url in urls:
            key = self._get_key_from_url(media_url, ".jpg")
            filename = os.path.join(Storage.TMP_FOLDER, key)

            d = requests.get(media_url, headers=headers)
            with open(filename, 'wb') as f:
                f.write(d.content)

            self.storage.upload(filename, key)
            hash = self.get_hash(filename)
            cdn_url = self.storage.get_cdn_url(key)
            if thumbnail is None:
                thumbnail = cdn_url
            uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})

        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)
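
    # Illustrative call (the tweet variable and its fields are hypothetical):
    #   page_cdn, page_hash, thumbnail = self.generate_media_page(
    #       [m["url"] for m in tweet["media"]], url, tweet)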

    def get_key(self, filename):
        """
        returns a key in the format "[archiverName]_[filename]", extension included
        """
        tail = os.path.split(filename)[1]  # returns filename.ext from full path
        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
        if 'unknown_video' in _id:
            _id = _id.replace('unknown_video', 'jpg')
        # long filenames can cause problems, so trim them if necessary
        if len(_id) > 128:
            _id = _id[-128:]
        return f'{self.name}_{_id}{extension}'
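
    # For example (illustrative): with self.name == "youtube",
    # get_key("/tmp/abc123.mp4") returns "youtube_abc123.mp4".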

    def get_html_key(self, url):
        return self._get_key_from_url(url, ".html")

    def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False):
        """
        Receives a URL and returns a slugified version of the URL path;
        if a string is passed in @with_extension, it is appended to the slug when the slug contains no ".";
        if @append_datetime is true, a timestamp is added after the URL slug and before the extension
        """
        slug = slugify(urlparse(url).path)
        if append_datetime:
            slug += "-" + slugify(datetime.datetime.utcnow().isoformat())
        if with_extension is not None:
            if "." not in slug:
                slug += with_extension
        return self.get_key(slug)
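
    # For example (illustrative): on an archiver whose name is "telegram",
    # _get_key_from_url("https://t.me/c/123/456", ".html") yields
    # "telegram_c-123-456.html".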

    def get_hash(self, filename):
        with open(filename, "rb") as f:
            bytes = f.read()  # read entire file as bytes

        # TODO: customizable hash
        hash = hashlib.sha256(bytes)
        # option to use SHA3_512 instead
        # hash = hashlib.sha3_512(bytes)
        return hash.hexdigest()

    def get_screenshot(self, url):
        logger.debug(f"getting screenshot for {url=}")
        key = self._get_key_from_url(url, ".png", append_datetime=True)
        filename = os.path.join(Storage.TMP_FOLDER, key)

        # dismiss Facebook's "accept cookies" popup so it does not obscure the screenshot
        if 'facebook.com' in url:
            try:
                logger.debug(f'Trying fb click accept cookie popup for {url}')
                self.driver.get("http://www.facebook.com")
                accept_button = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
                accept_button.click()
                logger.debug('fb click worked')
                # linux server needs a sleep otherwise facebook cookie won't have worked and we'll get a popup on next page
                time.sleep(2)
            except Exception:
                logger.warning(f'Failed on fb accept cookies for url {url}')

        try:
            self.driver.get(url)
            time.sleep(6)
        except TimeoutException:
            logger.info("TimeoutException loading page for screenshot")

        self.driver.save_screenshot(filename)
        self.storage.upload(filename, key, extra_args={
            'ACL': 'public-read', 'ContentType': 'image/png'})
        return self.storage.get_cdn_url(key)

    def get_thumbnails(self, filename, key, duration=None):
        thumbnails_folder = filename.split('.')[0] + '/'
        key_folder = key.split('.')[0] + '/'

        mkdir_if_not_exists(thumbnails_folder)

        # pick a sampling rate that yields roughly 10/20/40 thumbnails
        # depending on video length; default to one frame every 2 seconds
        fps = 0.5
        if duration is not None:
            duration = float(duration)
            if duration < 60:
                fps = 10.0 / duration
            elif duration < 120:
                fps = 20.0 / duration
            else:
                fps = 40.0 / duration

        stream = ffmpeg.input(filename)
        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
        stream.output(thumbnails_folder + 'out%d.jpg').run()

        thumbnails = os.listdir(thumbnails_folder)
        cdn_urls = []
        for fname in thumbnails:
            if fname[-3:] == 'jpg':
                thumbnail_filename = thumbnails_folder + fname
                key = key_folder + fname

                self.storage.upload(thumbnail_filename, key)
                cdn_url = self.storage.get_cdn_url(key)
                cdn_urls.append(cdn_url)

        if len(cdn_urls) == 0:
            return ('', '')

        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]

        index_page = f'''<html><head><title>{filename}</title><meta charset="UTF-8"></head>
<body>'''
        for t in cdn_urls:
            index_page += f'<img src="{t}" />'
        index_page += "</body></html>"

        index_fname = thumbnails_folder + 'index.html'
        with open(index_fname, 'w') as f:
            f.write(index_page)

        thumb_index = key_folder + 'index.html'
        self.storage.upload(index_fname, thumb_index, extra_args={
            'ACL': 'public-read', 'ContentType': 'text/html'})

        shutil.rmtree(thumbnails_folder)

        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
        return (key_thumb, thumb_index_cdn_url)
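
    # Worked example (illustrative): a 90s video falls into the 60-120s bucket,
    # so fps = 20/90 ≈ 0.22, i.e. one frame every ~4.5s and about 20 thumbnails.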

    def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs):
        """
        sets state to retry in a random number of seconds between (min_seconds, max_seconds)
        """
        now = datetime.datetime.now().timestamp()
        retry_at = int(now + randrange(min_seconds, max_seconds))
        logger.debug(f"signaling {retry_at=}")
        return ArchiveResult(status=f'retrying at {retry_at}', **kwargs)

    @staticmethod
    def is_retry(status):
        return re.search(Archiver.retry_regex, status) is not None

    @staticmethod
    def should_retry_from_status(status):
        """
        checks status against the message set by signal_retry_in;
        returns True if enough time has elapsed, False otherwise
        """
        match = re.search(Archiver.retry_regex, status)
        if match:
            retry_at = int(match.group(1))
            now = datetime.datetime.now().timestamp()
            should_retry = now >= retry_at
            logger.debug(f"{should_retry=} as {now=} >= {retry_at=}")
            return should_retry
        return False

    @staticmethod
    def remove_retry(status):
        """
        replaces the retry message in the status with a permanent failure message
        """
        new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0)
        logger.debug(f"removing retry message at {status=}, got {new_status=}")
        return new_status
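
    # Illustrative round trip of the retry helpers (timestamp is made up):
    #   result = archiver.signal_retry_in()               # status "retrying at 1655300000"
    #   Archiver.is_retry(result.status)                  # True
    #   Archiver.should_retry_from_status(result.status)  # False until that moment passes
    #   Archiver.remove_retry(result.status)              # "failed: too many retries"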