kopia lustrzana https://github.com/bellingcat/auto-archiver
73 wiersze
2.4 KiB
Python
73 wiersze
2.4 KiB
Python
import requests, re, html
|
|
from bs4 import BeautifulSoup
|
|
from loguru import logger
|
|
|
|
from . import Archiver
|
|
from ..core import Metadata, Media
|
|
|
|
|
|
class TelegramArchiver(Archiver):
    """
    Archiver for telegram that does not require login.

    The post is fetched through its ``embed=1`` widget page on ``t.me`` and
    scraped with BeautifulSoup. The telethon_archiver is much more advised;
    this one will only return a result if at least one image or one video is
    found.
    """
    name = "telegram_archiver"

    # Matches the target of every CSS ``url(...)`` inside a style attribute.
    _STYLE_URL_RE = re.compile(r'url\((.*?)\)')

    def __init__(self, config: dict) -> None:
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """Return the configuration options of this archiver (there are none)."""
        return {}

    @staticmethod
    def _embed_url(url: str) -> str:
        """Return ``url`` with ``embed=1`` in its query string, added only if missing."""
        from urllib.parse import parse_qs, urlsplit, urlunsplit

        parts = urlsplit(url)
        if "1" in parse_qs(parts.query, keep_blank_values=True).get("embed", []):
            return url
        # Keep any existing query verbatim and append to it, instead of
        # blindly concatenating a second "?" (or writing after a fragment).
        query = f"{parts.query}&embed=1" if parts.query else "embed=1"
        return urlunsplit(parts._replace(query=query))

    @staticmethod
    def _parse_duration(text) -> float:
        """Convert ``SS``, ``M:SS`` or ``H:MM:SS`` into seconds; raises ValueError on bad input."""
        seconds = 0.0
        for part in str(text).strip().split(":"):
            seconds = seconds * 60 + float(part)
        return seconds

    def _image_urls(self, soup) -> list:
        """Collect the ``url(...)`` targets from the style of every photo wrapper in ``soup``."""
        found = []
        for wrap in soup.find_all(class_="tgme_widget_message_photo_wrap"):
            # A wrapper without a style attribute simply contributes nothing.
            style = wrap.get('style', '')
            # CSS allows url('...'), url("...") and url(...): drop surrounding quotes only.
            found += [u.strip("'\"") for u in self._STYLE_URL_RE.findall(style)]
        return found

    def download(self, item: Metadata) -> Metadata:
        """
        Archive the media of a public ``t.me`` post.

        Returns ``False`` when the URL's netloc is not ``t.me`` or when neither
        a video nor an image is found; otherwise returns a new ``Metadata``
        holding the escaped page HTML, the post timestamp (when present) and
        the downloaded media, marked successful with ``"telegram"``.
        """
        url = item.get_url()
        # detect URLs that we definitely cannot handle
        if 't.me' != item.netloc:
            return False

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }

        url = self._embed_url(url)

        # NOTE(review): no timeout is passed, so a stalled connection can
        # block indefinitely; consider adding one.
        t = requests.get(url, headers=headers)
        s = BeautifulSoup(t.content, 'html.parser')

        result = Metadata()
        # Store the decoded page text; str() on the raw bytes would store the
        # "b'...'" repr with escaped non-ASCII bytes instead.
        result.set_content(html.escape(t.text))

        # The first <time> tag may be a video duration label, so pick the
        # first one that actually carries a datetime attribute.
        time_tag = s.find('time', attrs={'datetime': True})
        if time_tag is not None and (timestamp := time_tag.get('datetime')):
            result.set_timestamp(timestamp)

        video = s.find("video")
        video_url = video.get('src') if video is not None else None
        if not video_url:
            logger.warning("could not find video")
            image_urls = self._image_urls(s)
            if not image_urls:
                return False
            for img_url in image_urls:
                result.add_media(Media(self.download_from_url(img_url)))
        else:
            m_video = Media(self.download_from_url(video_url))
            # extract duration from HTML (best effort: a missing or malformed
            # label must not prevent the video from being archived)
            try:
                duration = self._parse_duration(s.find_all('time')[0].contents[0])
            except (IndexError, ValueError) as e:
                logger.debug(f"could not extract video duration: {e}")
            else:
                m_video.set("duration", duration)
            result.add_media(m_video)

        return result.success("telegram")
|