twitter archiver improvements

pull/33/head
msramalho 2022-06-14 20:55:43 +02:00
parent bd753b27ed
commit 2be539d39e
1 changed file with 11 additions and 5 deletions

View file

@ -1,6 +1,8 @@
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from loguru import logger import html
from urllib.parse import urlparse from urllib.parse import urlparse
from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from .base_archiver import Archiver, ArchiveResult from .base_archiver import Archiver, ArchiveResult
@ -11,6 +13,7 @@ class TwitterArchiver(Archiver):
def download(self, url, check_if_exists=False): def download(self, url, check_if_exists=False):
if 'twitter.com' != self.get_netloc(url): if 'twitter.com' != self.get_netloc(url):
logger.debug(f'{url=} is not from twitter')
return False return False
tweet_id = urlparse(url).path.split('/') tweet_id = urlparse(url).path.split('/')
@ -18,6 +21,7 @@ class TwitterArchiver(Archiver):
i = tweet_id.index('status') i = tweet_id.index('status')
tweet_id = tweet_id[i + 1] tweet_id = tweet_id[i + 1]
else: else:
logger.debug(f'{url=} does not contain "status"')
return False return False
scr = TwitterTweetScraper(tweet_id) scr = TwitterTweetScraper(tweet_id)
@ -29,8 +33,10 @@ class TwitterArchiver(Archiver):
return False return False
if tweet.media is None: if tweet.media is None:
logger.trace(f'No media found') logger.debug(f'No media found, archiving tweet text only')
return False screenshot = self.get_screenshot(url)
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
urls = [] urls = []
@ -50,4 +56,4 @@ class TwitterArchiver(Archiver):
screenshot = self.get_screenshot(url) screenshot = self.get_screenshot(url)
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date) return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)