auto-archiver/archivers/base_archiver.py

import datetime
import hashlib
import html
import os
import shutil
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urlparse

import ffmpeg
import requests

from storages import Storage
from utils import mkdir_if_not_exists


@dataclass
class ArchiveResult:
    """Outcome of one archive attempt.

    Only ``status`` is required; every other field defaults to ``None``
    and is filled in by whichever archiver produced the result. Field
    order is part of the interface (the dataclass accepts positional
    arguments), so new fields must be appended at the end.
    """
    status: str
    # The remaining fields are optional. Their exact semantics are set by the
    # individual archivers; the names suggest CDN locations of the archived
    # media, its thumbnail and thumbnail index page, plus metadata about the
    # media -- confirm against the archiver that fills them in.
    cdn_url: Optional[str] = None
    thumbnail: Optional[str] = None
    thumbnail_index: Optional[str] = None
    duration: Optional[float] = None
    title: Optional[str] = None
    timestamp: Optional[datetime.datetime] = None
    screenshot: Optional[str] = None
    hash: Optional[str] = None


class Archiver(ABC):
    """Base class for archivers.

    Subclasses override ``name`` (it prefixes every storage key built by
    ``get_key``) and implement ``download``. The helpers defined here build
    storage keys, hash files, take screenshots through ``self.driver``,
    extract video thumbnails with ffmpeg and generate small HTML index
    pages, uploading everything through ``self.storage``.

    Temporary files are written below the relative ``tmp/`` directory, which
    is expected to exist already.
    """
    name = "default"

    # Seconds `requests` may wait for the connection / for the next bytes of
    # a media download. Without a timeout a stalled server blocks forever.
    media_request_timeout = 30

    def __init__(self, storage: "Storage", driver):
        """Store the collaborators used by the helper methods.

        storage: object providing ``upload(filename, key, extra_args=...)``
            and ``get_cdn_url(key)``.
        driver: browser driver providing ``get(url)`` and
            ``save_screenshot(filename)`` (presumably a Selenium WebDriver
            -- confirm against the caller).
        """
        self.storage = storage
        self.driver = driver

    def __str__(self):
        return self.__class__.__name__

    @abstractmethod
    def download(self, url, check_if_exists=False):
        """Archive ``url``; must be implemented by every concrete archiver."""

    def get_netloc(self, url):
        """Return the network location (host[:port]) part of ``url``."""
        return urlparse(url).netloc

    def get_html_key(self, url):
        """Return the storage key of the generated HTML page for ``url``."""
        return self.get_key(urlparse(url).path.replace("/", "_") + ".html")

    def _render_media_page(self, url, urls_info, object):
        """Build the HTML listing for ``generate_media_page_html``.

        Every interpolated value originates from the archived site, so it is
        HTML-escaped before being embedded in the page.
        """
        safe_url = html.escape(str(url), quote=True)
        page = f'''<html><head><title>{safe_url}</title></head>
            <body>
            <h2>Archived media from {self.name}</h2>
            <h3><a href="{safe_url}">{safe_url}</a></h3><ul>'''

        for url_info in urls_info:
            cdn_url = html.escape(str(url_info['cdn_url']), quote=True)
            key = html.escape(str(url_info['key']))
            digest = html.escape(str(url_info['hash']))
            page += f'<li><a href="{cdn_url}">{key}</a>: {digest}</li>'

        page += f"</ul><h2>{self.name} object data:</h2><code>{html.escape(str(object))}</code>"
        page += "</body></html>"
        return page

    def generate_media_page_html(self, url, urls_info: list, object, thumbnail=None):
        """Write, hash and upload an HTML page listing archived media.

        url: the original page URL; also determines the page's storage key.
        urls_info: iterable of dicts with the keys ``'cdn_url'``, ``'key'``
            and ``'hash'``, one per archived media file.
        object: any value; its ``str()`` is shown (escaped) as "object data".
        thumbnail: passed through unchanged into the result.

        Returns the tuple ``(page_cdn_url, page_sha256_hex, thumbnail)``.
        """
        page = self._render_media_page(url, urls_info, object)

        page_key = self.get_html_key(url)
        page_filename = 'tmp/' + page_key
        page_cdn = self.storage.get_cdn_url(page_key)

        # Explicit encoding so the bytes (and therefore the hash) do not
        # depend on the platform's default locale.
        with open(page_filename, "w", encoding="utf-8") as f:
            f.write(page)

        page_hash = self.get_hash(page_filename)

        self.storage.upload(page_filename, page_key, extra_args={
            'ACL': 'public-read', 'ContentType': 'text/html'})
        return (page_cdn, page_hash, thumbnail)

    def generate_media_page(self, urls, url, object):
        """Download and upload each media URL, then build the listing page.

        urls: iterable of media URLs to fetch.
        url: the page the media belongs to.
        object: forwarded to ``generate_media_page_html``.

        The CDN URL of the first media file is used as the thumbnail. Returns
        whatever ``generate_media_page_html`` returns. Network errors and
        timeouts from ``requests`` propagate to the caller.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }

        thumbnail = None
        uploaded_media = []
        for media_url in urls:
            path = urlparse(media_url).path
            key = self.get_key(path.replace("/", "_"))
            if '.' not in path:
                # no extension anywhere in the path: treat the media as JPEG
                key += '.jpg'

            filename = 'tmp/' + key

            # NOTE(review): the HTTP status is not checked, so an error body
            # is stored like regular media -- confirm this is intended.
            response = requests.get(media_url, headers=headers,
                                    timeout=self.media_request_timeout)
            with open(filename, 'wb') as f:
                f.write(response.content)

            self.storage.upload(filename, key)
            digest = self.get_hash(filename)
            cdn_url = self.storage.get_cdn_url(key)

            if thumbnail is None:
                thumbnail = cdn_url
            uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': digest})

        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)

    def get_key(self, filename):
        """
        returns a key in the format "[archiverName]_[filename]" includes extension

        Only the last path component of ``filename`` is used. The substring
        'unknown_video' in the stem is replaced by 'jpg', and stems longer
        than 128 characters keep only their last 128 characters.
        """
        tail = os.path.split(filename)[1]  # returns filename.ext from full path
        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
        if 'unknown_video' in _id:
            _id = _id.replace('unknown_video', 'jpg')

        # long filenames can cause problems, so trim them if necessary
        if len(_id) > 128:
            _id = _id[-128:]

        return f'{self.name}_{_id}{extension}'

    def get_hash(self, filename):
        """Return the hex SHA-256 digest of the file at ``filename``.

        The file is read in chunks so large media files are not loaded into
        memory at once, and it is closed even if reading fails.
        """
        digest = hashlib.sha256()
        with open(filename, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    def get_screenshot(self, url):
        """Screenshot ``url`` with ``self.driver``, upload it, return its CDN URL.

        The key embeds the current UTC time so repeated screenshots of the
        same URL do not overwrite each other.
        """
        # isoformat() separates date and time with 'T', so the value never
        # contains spaces.
        timestamp = datetime.datetime.utcnow().isoformat()
        key = self.get_key(
            urlparse(url).path.replace("/", "_") + timestamp + ".png")
        filename = 'tmp/' + key

        self.driver.get(url)
        time.sleep(6)  # fixed pause after navigation before capturing

        self.driver.save_screenshot(filename)
        self.storage.upload(filename, key, extra_args={
                            'ACL': 'public-read', 'ContentType': 'image/png'})
        return self.storage.get_cdn_url(key)

    @staticmethod
    def _thumbnail_fps(duration):
        """Return the frame sampling rate used for thumbnail extraction.

        With a known positive duration the rate is chosen to yield roughly
        10 frames (under 60 s), 20 frames (under 120 s) or 40 frames
        (longer). Unknown or non-positive durations fall back to 0.5 fps
        instead of dividing by zero.
        """
        if duration is None:
            return 0.5
        duration = float(duration)
        if duration <= 0:
            return 0.5
        if duration < 60:
            frames = 10.0
        elif duration < 120:
            frames = 20.0
        else:
            frames = 40.0
        return frames / duration

    @staticmethod
    def _frame_sort_key(fname):
        """Sort key ordering 'out<N>.jpg' files by N (numeric, not lexical)."""
        digits = ''.join(ch for ch in fname if ch in '0123456789')
        return (int(digits) if digits else 0, fname)

    @staticmethod
    def _thumbnail_index_html(filename, cdn_urls):
        """Build the HTML page that shows every thumbnail in ``cdn_urls``."""
        index_page = f'''<html><head><title>{html.escape(str(filename))}</title></head>
            <body>'''
        index_page += ''.join(
            f'<img src="{html.escape(str(t), quote=True)}" />' for t in cdn_urls)
        index_page += "</body></html>"
        return index_page

    def get_thumbnails(self, filename, key, duration=None):
        """Extract thumbnails from a video, upload them and an index page.

        filename: local path of the video file.
        key: storage key of the video; the part before its first '.' becomes
            the storage "folder" for the thumbnails.
        duration: video length in seconds (number or numeric string), used
            to choose the sampling rate; ``None`` uses the default rate.

        Returns ``(key_thumbnail_cdn_url, index_page_cdn_url)``, where the
        key thumbnail is the frame about 10% into the extracted sequence.
        Returns the strings ``('None', 'None')`` when no frame was produced.
        The local thumbnail folder is removed in every case.
        """
        # splitext only strips the final extension; splitting on the first
        # '.' would turn './tmp/x.mp4' into '/' and later rmtree it.
        thumbnails_folder = os.path.splitext(filename)[0] + '/'
        # kept as-is so the layout of already generated storage keys is stable
        key_folder = key.split('.')[0] + '/'

        mkdir_if_not_exists(thumbnails_folder)
        try:
            fps = self._thumbnail_fps(duration)

            stream = ffmpeg.input(filename)
            stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
            stream.output(thumbnails_folder + 'out%d.jpg').run()

            # os.listdir() order is arbitrary; sort by frame number so the
            # index page and the key thumbnail follow the video timeline.
            frames = sorted(
                (fname for fname in os.listdir(thumbnails_folder)
                 if fname.endswith('jpg')),
                key=self._frame_sort_key)

            cdn_urls = []
            for fname in frames:
                thumbnail_filename = thumbnails_folder + fname
                thumb_key = key_folder + fname

                cdn_url = self.storage.get_cdn_url(thumb_key)
                self.storage.upload(thumbnail_filename, thumb_key)
                cdn_urls.append(cdn_url)

            if not cdn_urls:
                return ('None', 'None')

            key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]

            index_fname = thumbnails_folder + 'index.html'
            with open(index_fname, 'w', encoding='utf-8') as f:
                f.write(self._thumbnail_index_html(filename, cdn_urls))

            thumb_index = key_folder + 'index.html'
            self.storage.upload(index_fname, thumb_index, extra_args={
                                'ACL': 'public-read', 'ContentType': 'text/html'})

            thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
            return (key_thumb, thumb_index_cdn_url)
        finally:
            # best-effort cleanup that must not mask an error raised above
            shutil.rmtree(thumbnails_folder, ignore_errors=True)
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`import os`
			`import ffmpeg`
			`import datetime`
improved tmp folder management 2022-02-23 15:43:42 +00:00			`import shutil`
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`from dataclasses import dataclass`
			`from abc import ABC, abstractmethod`
cleanup and docs 2022-02-23 15:07:58 +00:00			`from urllib.parse import urlparse`
Add hash and screenshot methods; switch to more recent ytdl fork 2022-02-25 12:54:40 +00:00			`import hashlib`
			`import time`
Generate archivers for Telegram posts with images; move generation to function in base_archiver 2022-02-28 07:41:45 +00:00			`import requests`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`from storages import Storage`
creates tmp folder if not exists 2022-02-23 15:32:38 +00:00			`from utils import mkdir_if_not_exists`
split into multiple files MVP 2022-02-21 13:19:09 +00:00

			`@dataclass`
			`class ArchiveResult:`
			`status: str`
			`cdn_url: str = None`
			`thumbnail: str = None`
			`thumbnail_index: str = None`
			`duration: float = None`
			`title: str = None`
			`timestamp: datetime.datetime = None`
Add hash and screenshot methods; switch to more recent ytdl fork 2022-02-25 12:54:40 +00:00			`screenshot: str = None`
			`hash: str = None`
split into multiple files MVP 2022-02-21 13:19:09 +00:00

refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`class Archiver(ABC):`
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`name = "default"`

Add hash and screenshot methods; switch to more recent ytdl fork 2022-02-25 12:54:40 +00:00			`def __init__(self, storage: Storage, driver):`
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`self.storage = storage`
Add hash and screenshot methods; switch to more recent ytdl fork 2022-02-25 12:54:40 +00:00			`self.driver = driver`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
			`def __str__(self):`
			`return self.__class__.__name__`

refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`@abstractmethod`
			`def download(self, url, check_if_exists=False): pass`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
cleanup and docs 2022-02-23 15:07:58 +00:00			`def get_netloc(self, url):`
			`return urlparse(url).netloc`

isloates html page generation logic so it can be reused 2022-03-16 18:50:44 +00:00			`def get_html_key(self, url):`
			`return self.get_key(urlparse(url).path.replace("/", "_") + ".html")`
Generate archivers for Telegram posts with images; move generation to function in base_archiver 2022-02-28 07:41:45 +00:00
isloates html page generation logic so it can be reused 2022-03-16 18:50:44 +00:00			`def generate_media_page_html(self, url, urls_info: dict, object, thumbnail=None):`
Generate archivers for Telegram posts with images; move generation to function in base_archiver 2022-02-28 07:41:45 +00:00			`page = f'''<html><head><title>{url}</title></head>`
			`<body>`
			`<h2>Archived media from {self.name}</h2>`
			`<h3><a href="{url}">{url}</a></h3><ul>'''`

isloates html page generation logic so it can be reused 2022-03-16 18:50:44 +00:00			`for url_info in urls_info:`
			`page += f'''<li><a href="{url_info['cdn_url']}">{url_info['key']}</a>: {url_info['hash']}</li>'''`

			`page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"`
			`page += f"</body></html>"`

			`page_key = self.get_key(urlparse(url).path.replace("/", "_") + ".html")`
			`page_filename = 'tmp/' + page_key`
			`page_cdn = self.storage.get_cdn_url(page_key)`
Generate archivers for Telegram posts with images; move generation to function in base_archiver 2022-02-28 07:41:45 +00:00
isloates html page generation logic so it can be reused 2022-03-16 18:50:44 +00:00			`with open(page_filename, "w") as f:`
			`f.write(page)`

			`page_hash = self.get_hash(page_filename)`

			`self.storage.upload(page_filename, page_key, extra_args={`
			`'ACL': 'public-read', 'ContentType': 'text/html'})`
			`return (page_cdn, page_hash, thumbnail)`

			`def generate_media_page(self, urls, url, object):`
			`headers = {`
			`'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'`
			`}`

			`thumbnail = None`
			`uploaded_media = []`
Generate archivers for Telegram posts with images; move generation to function in base_archiver 2022-02-28 07:41:45 +00:00			`for media_url in urls:`
			`path = urlparse(media_url).path`
			`key = self.get_key(path.replace("/", "_"))`
			`if '.' not in path:`
			`key += '.jpg'`

			`filename = 'tmp/' + key`

			`d = requests.get(media_url, headers=headers)`
			`with open(filename, 'wb') as f:`
			`f.write(d.content)`

			`self.storage.upload(filename, key)`
			`hash = self.get_hash(filename)`
			`cdn_url = self.storage.get_cdn_url(key)`

			`if thumbnail is None:`
			`thumbnail = cdn_url`
isloates html page generation logic so it can be reused 2022-03-16 18:50:44 +00:00			`uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})`
Generate archivers for Telegram posts with images; move generation to function in base_archiver 2022-02-28 07:41:45 +00:00
isloates html page generation logic so it can be reused 2022-03-16 18:50:44 +00:00			`return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`def get_key(self, filename):`
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`"""`
			`returns a key in the format "[archiverName]_[filename]" includes extension`
			`"""`
			`tail = os.path.split(filename)[1] # returns filename.ext from full path`
			`_id, extension = os.path.splitext(tail) # returns [filename, .ext]`
			`if 'unknown_video' in _id:`
			`_id = _id.replace('unknown_video', 'jpg')`
Generate archivers for Telegram posts with images; move generation to function in base_archiver 2022-02-28 07:41:45 +00:00
			`# long filenames can cause problems, so trim them if necessary`
			`if len(_id) > 128:`
			`_id = _id[-128:]`

refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`return f'{self.name}_{_id}{extension}'`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
Add hash and screenshot methods; switch to more recent ytdl fork 2022-02-25 12:54:40 +00:00			`def get_hash(self, filename):`
			`f = open(filename, "rb")`
			`bytes = f.read() # read entire file as bytes`
			`hash = hashlib.sha256(bytes)`
			`f.close()`
			`return hash.hexdigest()`

			`def get_screenshot(self, url):`
			`key = self.get_key(urlparse(url).path.replace(`
			`"/", "_") + datetime.datetime.utcnow().isoformat().replace(" ", "_") + ".png")`
			`filename = 'tmp/' + key`

			`self.driver.get(url)`
			`time.sleep(6)`

			`self.driver.save_screenshot(filename)`
			`self.storage.upload(filename, key, extra_args={`
			`'ACL': 'public-read', 'ContentType': 'image/png'})`
			`return self.storage.get_cdn_url(key)`

cleanup and docs 2022-02-23 15:07:58 +00:00			`def get_thumbnails(self, filename, key, duration=None):`
			`thumbnails_folder = filename.split('.')[0] + '/'`
			`key_folder = key.split('.')[0] + '/'`

creates tmp folder if not exists 2022-02-23 15:32:38 +00:00			`mkdir_if_not_exists(thumbnails_folder)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
			`fps = 0.5`
			`if duration is not None:`
			`duration = float(duration)`

			`if duration < 60:`
			`fps = 10.0 / duration`
			`elif duration < 120:`
			`fps = 20.0 / duration`
			`else:`
			`fps = 40.0 / duration`

			`stream = ffmpeg.input(filename)`
			`stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)`
cleanup and docs 2022-02-23 15:07:58 +00:00			`stream.output(thumbnails_folder + 'out%d.jpg').run()`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
cleanup and docs 2022-02-23 15:07:58 +00:00			`thumbnails = os.listdir(thumbnails_folder)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`cdn_urls = []`
			`for fname in thumbnails:`
			`if fname[-3:] == 'jpg':`
cleanup and docs 2022-02-23 15:07:58 +00:00			`thumbnail_filename = thumbnails_folder + fname`
			`key = key_folder + fname`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`cdn_url = self.storage.get_cdn_url(key)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`self.storage.upload(thumbnail_filename, key)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
			`cdn_urls.append(cdn_url)`

			`if len(cdn_urls) == 0:`
			`return ('None', 'None')`

			`key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]`

			`index_page = f'''<html><head><title>{filename}</title></head>`
			`<body>'''`

			`for t in cdn_urls:`
			`index_page += f'<img src="{t}" />'`

			`index_page += f"</body></html>"`
cleanup and docs 2022-02-23 15:07:58 +00:00			`index_fname = thumbnails_folder + 'index.html'`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
			`with open(index_fname, 'w') as f:`
			`f.write(index_page)`

cleanup and docs 2022-02-23 15:07:58 +00:00			`thumb_index = key_folder + 'index.html'`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
Generate archivers for Telegram posts with images; move generation to function in base_archiver 2022-02-28 07:41:45 +00:00			`self.storage.upload(index_fname, thumb_index, extra_args={`
			`'ACL': 'public-read', 'ContentType': 'text/html'})`
improved tmp folder management 2022-02-23 15:43:42 +00:00			`shutil.rmtree(thumbnails_folder)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
			`return (key_thumb, thumb_index_cdn_url)`