diff --git a/archivers/twitter_archiver.py b/archivers/twitter_archiver.py
index 04ed578..29c43fe 100644
--- a/archivers/twitter_archiver.py
+++ b/archivers/twitter_archiver.py
@@ -1,6 +1,8 @@
-from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
-from loguru import logger
+
+import html
 from urllib.parse import urlparse
+from loguru import logger
+from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
 
 from .base_archiver import Archiver, ArchiveResult
 
@@ -11,6 +13,7 @@ class TwitterArchiver(Archiver):
 
     def download(self, url, check_if_exists=False):
         if 'twitter.com' != self.get_netloc(url):
+            logger.debug(f'{url=} is not from twitter')
             return False
 
         tweet_id = urlparse(url).path.split('/')
@@ -18,6 +21,7 @@ class TwitterArchiver(Archiver):
             i = tweet_id.index('status')
             tweet_id = tweet_id[i + 1]
         else:
+            logger.debug(f'{url=} does not contain "status"')
             return False
 
         scr = TwitterTweetScraper(tweet_id)
@@ -29,8 +33,10 @@ class TwitterArchiver(Archiver):
             return False
 
         if tweet.media is None:
-            logger.trace(f'No media found')
-            return False
+            logger.debug(f'No media found, archiving tweet text only')
+            screenshot = self.get_screenshot(url)
+            page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
+            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
 
         urls = []
 
@@ -50,4 +56,4 @@ class TwitterArchiver(Archiver):
 
         screenshot = self.get_screenshot(url)
 
-        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)
+        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
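
For context on the new text-only branch, here is a minimal, self-contained sketch of why the raw tweet JSON is passed through `html.escape` before being handed to `generate_media_page_html`: the tweet body may contain characters that are significant in HTML, and escaping them keeps the JSON from breaking, or being interpreted as, markup in the generated archive page. The JSON string below is made up for illustration and is not part of the patch; the real output of `tweet.json()` is much larger.

```python
import html

# Hypothetical tweet payload (illustration only); tweet.json() returns the
# full snscrape tweet serialization in practice.
raw_json = '{"content": "see <script>alert(1)</script> & more"}'

# Escaping turns HTML-significant characters into entities so the JSON can be
# embedded verbatim in the generated page without rendering as markup.
safe = html.escape(raw_json)
print(safe)
# {&quot;content&quot;: &quot;see &lt;script&gt;alert(1)&lt;/script&gt; &amp; more&quot;}
```

How `tweet.content` is rendered once it is stored as the `ArchiveResult` title is outside this diff and is not escaped here.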