auto-archiver/src/auto_archiver/archivers/telegram_archiver.py

import requests, re, html
from bs4 import BeautifulSoup
from loguru import logger
from . import Archiver
from ..core import Metadata, Media


class TelegramArchiver(Archiver):
    """
    Archiver for Telegram posts that does not require a login; the
    telethon_archiver is strongly recommended instead. It only returns a
    result if at least one image or one video is found.
    """
    name = "telegram_archiver"

    def __init__(self, config: dict) -> None:
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        return {}

    def download(self, item: Metadata) -> Metadata:
        url = item.get_url()

        # detect URLs that we definitely cannot handle
        if 't.me' != item.netloc:
            return False

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }

        # TODO: make this more resilient to variable URL formats
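        # ?embed=1 asks t.me for the embeddable HTML version of the post,
        # which exposes the media and timestamp without requiring a login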
if url[-8:] != "?embed=1":
url += "?embed=1"
t = requests.get(url, headers=headers)
s = BeautifulSoup(t.content, 'html.parser')

        result = Metadata()
        # t.text is the decoded HTML; escaping str(t.content) would store a
        # Python bytes literal (b'...') instead of the page markup
        result.set_content(html.escape(t.text))
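        # the first <time> element carries the post timestamp in its
        # 'datetime' attribute; `or [{}]` guards against pages without one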
        if (timestamp := (s.find_all('time') or [{}])[0].get('datetime')):
            result.set_timestamp(timestamp)
video = s.find("video")
if video is None:
logger.warning("could not find video")
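            # no <video> tag: fall back to photos, whose URLs the embed page
            # exposes as CSS background-image styles on the photo wrappers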
            image_tags = s.find_all(class_="tgme_widget_message_photo_wrap")
            image_urls = []
            for im in image_tags:
                urls = [u.replace("'", "") for u in re.findall(r'url\((.*?)\)', im['style'])]
                image_urls += urls

            if not image_urls:
                return False

            for img_url in image_urls:
                result.add_media(Media(self.download_from_url(img_url)))
        else:
            video_url = video.get('src')
            m_video = Media(self.download_from_url(video_url))

            # extract the duration shown in the embed's <time> element,
            # which is formatted as "M:SS" or as plain seconds
            try:
                duration = s.find_all('time')[0].contents[0]
                if ':' in duration:
                    duration = float(duration.split(':')[0]) * 60 + float(duration.split(':')[1])
                else:
                    duration = float(duration)
                m_video.set("duration", duration)
            except Exception:
                pass
            result.add_media(m_video)

        return result.success("telegram")
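
# Minimal usage sketch, assuming the usual auto-archiver wiring; the channel
# and post ID in the URL are hypothetical. Kept as a comment so importing
# this module stays side-effect free:
#
#   archiver = TelegramArchiver({})
#   item = Metadata().set_url("https://t.me/example_channel/123")
#   result = archiver.download(item)
#   if result:
#       print(result.media)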