2022-02-21 13:19:09 +00:00
|
|
|
import os
|
|
|
|
import requests
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
2022-02-22 15:03:35 +00:00
|
|
|
from .base_archiver import Archiver, ArchiveResult
|
2022-02-21 13:19:09 +00:00
|
|
|
|
|
|
|
|
|
|
|
class TelegramArchiver(Archiver):
    """Archive the video embedded in a public Telegram (``t.me``) post.

    The post's embed page is fetched and parsed for a ``<video>`` element;
    the referenced file is downloaded to a temporary path, handed to
    ``self.storage`` and to ``self.get_thumbnails``, and then removed again.
    """

    name = "telegram"

    @staticmethod
    def _embed_url(url):
        """Return ``url`` with the ``embed=1`` query parameter appended.

        A URL that already ends in ``?embed=1`` or ``&embed=1`` is returned
        unchanged. If the URL already carries a query string the parameter is
        joined with ``&`` so that no second ``?`` is introduced.
        """
        if url.endswith(("?embed=1", "&embed=1")):
            return url
        separator = "&" if "?" in url else "?"
        return url + separator + "embed=1"

    @staticmethod
    def _parse_duration(text):
        """Convert a duration label to seconds.

        Accepts a plain number of seconds (``"42"``) or colon-separated
        fields such as ``"m:ss"`` or ``"h:mm:ss"``; every field to the left
        is worth 60 times the one to its right.

        Raises:
            ValueError: if a field is not a valid number.
        """
        seconds = 0.0
        for field in str(text).strip().split(':'):
            seconds = seconds * 60 + float(field)
        return seconds

    def download(self, url, check_if_exists=False):
        """Download and archive the video of a Telegram post.

        Args:
            url: address of the post; only the ``t.me`` host is handled.
            check_if_exists: when true and ``self.storage.exists(key)``
                reports the video as stored, the upload is skipped and the
                result status is ``'already archived'``. The video is still
                downloaded because the thumbnail step works on the local file.

        Returns:
            ``False`` if the URL is not a ``t.me`` link or the page exposes
            no video source; otherwise an ``ArchiveResult``.

        Raises:
            IndexError: if the embed page has fewer ``<time>`` elements than
                the duration/timestamp extraction expects.
        """
        # detect URLs that we definitely cannot handle
        if 't.me' != self.get_netloc(url):
            return False

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }

        original_url = url
        url = self._embed_url(url)

        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        video = soup.find("video")

        if video is None:
            return False  # could not find video

        video_url = video.get('src')
        if not video_url:
            return False  # <video> tag without a usable source

        # last path segment of the media URL, without its query string
        video_id = video_url.split('/')[-1].split('?')[0]
        key = self.get_key(video_id)
        filename = 'tmp/' + key

        already_archived = bool(check_if_exists and self.storage.exists(key))
        status = 'already archived' if already_archived else 'success'

        media = requests.get(video_url, headers=headers)

        # make sure the temporary directory (and any sub-folder in key) exists
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        try:
            with open(filename, 'wb') as f:
                f.write(media.content)

            cdn_url = self.storage.get_cdn_url(key)
            if not already_archived:
                self.storage.upload(filename, key)

            # extract duration and timestamp from HTML: the first <time>
            # element is read as the duration, the second as the post date
            time_tags = soup.find_all('time')
            duration = self._parse_duration(time_tags[0].contents[0])
            timestamp = time_tags[1].get('datetime')

            # process thumbnails
            key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
        finally:
            # never leave the temporary download behind, even on failure
            if os.path.exists(filename):
                os.remove(filename)

        return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index,
                             duration=duration, title=original_url, timestamp=timestamp)
|