mirror of https://github.com/bellingcat/auto-archiver
twitter archiver improvements
parent
bd753b27ed
commit
2be539d39e
|
@ -1,6 +1,8 @@
|
||||||
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
|
||||||
from loguru import logger
|
import html
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
from loguru import logger
|
||||||
|
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
|
||||||
|
|
||||||
from .base_archiver import Archiver, ArchiveResult
|
from .base_archiver import Archiver, ArchiveResult
|
||||||
|
|
||||||
|
@ -11,6 +13,7 @@ class TwitterArchiver(Archiver):
|
||||||
def download(self, url, check_if_exists=False):
|
def download(self, url, check_if_exists=False):
|
||||||
|
|
||||||
if 'twitter.com' != self.get_netloc(url):
|
if 'twitter.com' != self.get_netloc(url):
|
||||||
|
logger.debug(f'{url=} is not from twitter')
|
||||||
return False
|
return False
|
||||||
|
|
||||||
tweet_id = urlparse(url).path.split('/')
|
tweet_id = urlparse(url).path.split('/')
|
||||||
|
@ -18,6 +21,7 @@ class TwitterArchiver(Archiver):
|
||||||
i = tweet_id.index('status')
|
i = tweet_id.index('status')
|
||||||
tweet_id = tweet_id[i + 1]
|
tweet_id = tweet_id[i + 1]
|
||||||
else:
|
else:
|
||||||
|
logger.debug(f'{url=} does not contain "status"')
|
||||||
return False
|
return False
|
||||||
|
|
||||||
scr = TwitterTweetScraper(tweet_id)
|
scr = TwitterTweetScraper(tweet_id)
|
||||||
|
@ -29,8 +33,10 @@ class TwitterArchiver(Archiver):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if tweet.media is None:
|
if tweet.media is None:
|
||||||
logger.trace(f'No media found')
|
logger.debug(f'No media found, archiving tweet text only')
|
||||||
return False
|
screenshot = self.get_screenshot(url)
|
||||||
|
page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
|
||||||
|
return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)
|
||||||
|
|
||||||
urls = []
|
urls = []
|
||||||
|
|
||||||
|
@ -50,4 +56,4 @@ class TwitterArchiver(Archiver):
|
||||||
|
|
||||||
screenshot = self.get_screenshot(url)
|
screenshot = self.get_screenshot(url)
|
||||||
|
|
||||||
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date)
|
return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
|
||||||
|
|
Loading…
Reference in New Issue