2022-02-25 12:55:43 +00:00
|
|
|
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
|
|
|
from loguru import logger
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
|
|
from .base_archiver import Archiver, ArchiveResult
|
|
|
|
|
|
|
|
|
|
|
|
class TwitterArchiver(Archiver):
    """Archiver that collects the media attached to a single tweet.

    The tweet is looked up with snscrape's ``TwitterTweetScraper``; the URLs of
    its videos, GIFs and photos are handed to ``self.generate_media_page`` and
    a screenshot of the tweet page is requested via ``self.get_screenshot``.
    """

    name = "twitter"

    def download(self, url, check_if_exists=False):
        """Archive the media of the tweet at ``url``.

        Args:
            url: Address of the tweet; its network location must be exactly
                ``twitter.com`` and its path must contain ``status/<id>``.
            check_if_exists: Accepted for interface compatibility; this method
                does not consult it.

        Returns:
            ``False`` when the URL is not a usable tweet URL, when the tweet
            cannot be fetched, or when the tweet carries no media; otherwise
            an ``ArchiveResult`` with ``status="success"``.
        """
        if self.get_netloc(url) != 'twitter.com':
            return False

        # The tweet id is the path segment right after "status". A path that
        # has no "status" segment, or ends with it, is not a tweet URL.
        segments = urlparse(url).path.split('/')
        try:
            tweet_id = segments[segments.index('status') + 1]
        except (ValueError, IndexError):
            return False
        if not tweet_id:
            # e.g. ".../status/" leaves an empty trailing segment.
            return False

        scr = TwitterTweetScraper(tweet_id)

        try:
            # StopIteration (no item produced) is an Exception subclass and is
            # therefore reported through the same warning path.
            tweet = next(scr.get_items())
        except Exception as ex:
            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
            return False

        if tweet.media is None:
            logger.trace('No media found')
            return False

        urls = []
        for media in tweet.media:
            media_url = self._media_url(media)
            if media_url is None:
                logger.warning(f"Could not get media URL of {media}")
            else:
                urls.append(media_url)

        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())
        screenshot = self.get_screenshot(url)

        return ArchiveResult(
            status="success",
            cdn_url=page_cdn,
            screenshot=screenshot,
            hash=page_hash,
            thumbnail=thumbnail,
            timestamp=tweet.date,
        )

    @staticmethod
    def _media_url(media):
        """Return the download URL for one media item, or ``None`` if unavailable."""
        if isinstance(media, Video):
            # Only variants with a truthy bitrate are comparable; pick the
            # highest one. ``default=None`` avoids a ValueError when no
            # variant qualifies.
            best = max(
                (v for v in media.variants if v.bitrate),
                key=lambda v: v.bitrate,
                default=None,
            )
            return None if best is None else best.url
        if isinstance(media, Gif):
            # Guard against a GIF that exposes no variants at all.
            return media.variants[0].url if media.variants else None
        if isinstance(media, Photo):
            # Swap the "large" rendition parameter for "orig" in the photo URL.
            return media.fullUrl.replace('name=large', 'name=orig')
        return None
|