auto-archiver/archivers/twitter_archiver.py

71 wiersze
2.4 KiB
Python
Czysty Zwykły widok Historia

2022-02-25 12:55:43 +00:00
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger
import requests
from urllib.parse import urlparse
from .base_archiver import Archiver, ArchiveResult
2022-04-01 10:21:06 +00:00
import traceback
2022-02-25 12:55:43 +00:00
class TwitterArchiver(Archiver):
    """Archiver for twitter.com status URLs; tweet data is fetched with snscrape."""

    name = "twitter"

    # TODO(DM): the filenumber parameter was added ad hoc; revisit.
    def download(self, url, check_if_exists=False, filenumber=None):
        """Archive the media attached to the tweet at *url*.

        Args:
            url: Link to a tweet; its netloc must be ``twitter.com`` and its
                path must contain a ``status/<id>`` segment pair.
            check_if_exists: Accepted for interface compatibility; not used
                by this method.
            filenumber: Optional value that is logged and forwarded to
                ``generate_media_page`` and ``get_screenshot``.

        Returns:
            ``False`` when the URL is not a twitter.com status URL, when the
            tweet cannot be scraped, or when the tweet has no media.
            Otherwise an ``ArchiveResult`` with ``status="success"``.
        """
        if filenumber is not None:
            logger.debug(f'filenumber is {filenumber}')

        if 'twitter.com' != self.get_netloc(url):
            return False

        tweet_id = self._extract_tweet_id(url)
        if tweet_id is None:
            return False

        scr = TwitterTweetScraper(tweet_id)
        try:
            tweet = next(scr.get_items())
        except Exception:
            # Broad on purpose: any scraper failure (including an exhausted
            # iterator) means this archiver cannot handle the URL.
            logger.warning(f'TwitterArchiver cant get tweet for url {url} - can happen if a media sensitive tweet: \n{traceback.format_exc()}')
            return False

        if tweet.media is None:
            return False

        urls = []
        for media in tweet.media:
            media_url = self._media_url(media)
            if media_url is None:
                logger.warning(f"Could not get media URL of {media}")
            else:
                urls.append(media_url)

        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json(), filenumber)

        screenshot = self.get_screenshot(url, filenumber)

        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)

    @staticmethod
    def _extract_tweet_id(url):
        """Return the path segment following ``status`` in *url*, or ``None``."""
        parts = urlparse(url).path.split('/')
        if 'status' not in parts:
            return None
        id_index = parts.index('status') + 1
        # Guard against URLs that end in ".../status" or ".../status/".
        if id_index >= len(parts) or not parts[id_index]:
            return None
        return parts[id_index]

    @staticmethod
    def _media_url(media):
        """Return the download URL for one media item, or ``None`` if none can be chosen."""
        if isinstance(media, Video):
            # Only variants with a truthy bitrate are candidates; take the
            # highest. With no such variant there is nothing to pick.
            candidates = [v for v in media.variants if v.bitrate]
            if not candidates:
                return None
            return max(candidates, key=lambda v: v.bitrate).url

        if isinstance(media, Gif):
            if not media.variants:
                return None
            return media.variants[0].url

        if isinstance(media, Photo):
            # https://webtrickz.com/download-images-in-original-size-on-twitter/
            # e.g. 'https://pbs.twimg.com/media/ExeUSW2UcAE6RbN?format=jpg&name=large'
            # Requesting name=orig instead of name=large gets original quality.
            return media.fullUrl.replace("name=large", "name=orig")

        return None