auto-archiver/archivers/telegram_archiver.py

import os
from urllib.parse import urlsplit, urlunsplit

import requests
from bs4 import BeautifulSoup

from .base_archiver import Archiver, ArchiveResult


class TelegramArchiver(Archiver):
    """Archive the video embedded in a ``t.me`` post.

    The post's embed page is fetched, the first ``<video>`` element is
    downloaded to a temporary file, uploaded through ``self.storage`` and
    passed to ``self.get_thumbnails``.
    """

    name = "telegram"

    # Browser-like User-Agent sent with every request made by this archiver.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    # Seconds to wait for the server to connect / send data before giving up,
    # so a stalled connection cannot block the archiver forever.
    _REQUEST_TIMEOUT = 30

    @staticmethod
    def _embed_url(url):
        """Return *url* with the ``embed=1`` query parameter present.

        Unlike plain string concatenation this keeps an existing query string
        or fragment intact (``...?single`` becomes ``...?single&embed=1``).
        A URL that already carries ``embed=1`` is returned unchanged.
        """
        parts = urlsplit(url)
        params = parts.query.split('&') if parts.query else []
        if 'embed=1' in params:
            return url
        params.append('embed=1')
        return urlunsplit(parts._replace(query='&'.join(params)))

    @staticmethod
    def _parse_duration(text):
        """Convert a ``SS``, ``MM:SS`` or ``HH:MM:SS`` string to seconds.

        Raises:
            ValueError: if any colon-separated component is not a number.
        """
        seconds = 0.0
        # Fold left to right: every further component is one unit (x60) finer.
        for part in str(text).strip().split(':'):
            seconds = seconds * 60 + float(part)
        return seconds

    def download(self, url, check_if_exists=False):
        """Archive the video of the Telegram post at *url*.

        Args:
            url: address of the post; only handled when its netloc is ``t.me``.
            check_if_exists: when true and ``self.storage.exists(key)`` reports
                the video as stored, the upload is skipped and the status is
                ``'already archived'``. The video is still downloaded because
                the thumbnails are generated from the local file.

        Returns:
            ``False`` when the URL is not a ``t.me`` URL or the page has no
            usable ``<video>`` element, otherwise an ``ArchiveResult``.

        Raises:
            requests.RequestException: on network failure or timeout.
            IndexError: if the page has fewer than two ``<time>`` elements.
            ValueError: if the duration text is not numeric.
        """
        # detect URLs that we definitely cannot handle
        if 't.me' != self.get_netloc(url):
            return False

        original_url = url

        page = requests.get(self._embed_url(url), headers=self._HEADERS,
                            timeout=self._REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.content, 'html.parser')
        video = soup.find("video")

        if video is None:
            return False  # could not find video

        video_url = video.get('src')
        if not video_url:
            return False  # <video> element without a source: nothing to fetch

        # Last path segment of the video URL, without its query string.
        video_id = video_url.split('/')[-1].split('?')[0]
        key = self.get_key(video_id)
        filename = 'tmp/' + key

        status = "success"
        if check_if_exists and self.storage.exists(key):
            status = 'already archived'

        response = requests.get(video_url, headers=self._HEADERS,
                                timeout=self._REQUEST_TIMEOUT)

        # The scratch directory may not exist yet (and the key may contain
        # sub-directories — TODO confirm against get_key).
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        try:
            with open(filename, 'wb') as f:
                f.write(response.content)

            cdn_url = self.storage.get_cdn_url(key)
            if status != 'already archived':
                self.storage.upload(filename, key)

            # extract duration and timestamp from HTML: the first <time>
            # element holds the duration text, the second one carries the
            # `datetime` attribute used as timestamp
            time_tags = soup.find_all('time')
            duration = self._parse_duration(time_tags[0].contents[0])
            timestamp = time_tags[1].get('datetime')

            # process thumbnails
            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
        finally:
            # Always drop the temporary download, even when a step above fails.
            if os.path.exists(filename):
                os.remove(filename)

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                             duration=duration, title=original_url, timestamp=timestamp)
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`import os`
			`import requests`
			`from bs4 import BeautifulSoup`

refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`from .base_archiver import Archiver, ArchiveResult`
split into multiple files MVP 2022-02-21 13:19:09 +00:00

			`class TelegramArchiver(Archiver):`
			`name = "telegram"`
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`def download(self, url, check_if_exists=False):`
			`# detect URLs that we definitely cannot handle`
cleanup and docs 2022-02-23 15:07:58 +00:00			`if 't.me' != self.get_netloc(url):`
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`return False`

			`headers = {`
			`'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'`
			`}`
			`status = "success"`

			`original_url = url`

cleanup and docs 2022-02-23 15:07:58 +00:00			`# TODO: check if we can do this more resilient to variable URLs`
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`if url[-8:] != "?embed=1":`
			`url += "?embed=1"`

			`t = requests.get(url, headers=headers)`
			`s = BeautifulSoup(t.content, 'html.parser')`
			`video = s.find("video")`

			`if video is None:`
			`return False # could not find video`

			`video_url = video.get('src')`
cleanup and docs 2022-02-23 15:07:58 +00:00			`video_id = video_url.split('/')[-1].split('?')[0]`
			`key = self.get_key(video_id)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`filename = 'tmp/' + key`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`if check_if_exists and self.storage.exists(key):`
			`status = 'already archived'`
			`cdn_url = self.storage.get_cdn_url(key)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
			`v = requests.get(video_url, headers=headers)`

			`with open(filename, 'wb') as f:`
			`f.write(v.content)`

			`if status != 'already archived':`
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`cdn_url = self.storage.get_cdn_url(key)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
refactoring storage and bringing changes from origin 2022-02-22 15:03:35 +00:00			`self.storage.upload(filename, key)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00
			`# extract duration from HTML`
			`duration = s.find_all('time')[0].contents[0]`
			`if ':' in duration:`
making code more resilient to exceptions 2022-02-23 12:57:11 +00:00			`duration = float(duration.split(':')[0]) * 60`
			`+ float(duration.split(':')[1])`
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`else:`
			`duration = float(duration)`

			`# process thumbnails`
cleanup and docs 2022-02-23 15:07:58 +00:00			`key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)`
split into multiple files MVP 2022-02-21 13:19:09 +00:00			`os.remove(filename)`

			`return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,`
			`duration=duration, title=original_url, timestamp=s.find_all('time')[1].get('datetime'))`