kopia lustrzana https://github.com/bellingcat/auto-archiver
				
				
				
			
		
			
				
	
	
		
			60 wiersze
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
			
		
		
	
	
			60 wiersze
		
	
	
		
			2.2 KiB
		
	
	
	
		
			Python
		
	
	
| 
 | |
| import html
 | |
| from urllib.parse import urlparse
 | |
| from loguru import logger
 | |
| from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
 | |
| 
 | |
| from .base_archiver import Archiver, ArchiveResult
 | |
| 
 | |
| 
 | |
class TwitterArchiver(Archiver):
    """Archiver for single tweets, fetched with snscrape's ``TwitterTweetScraper``."""

    name = "twitter"

    @staticmethod
    def _media_url(media):
        """Return the download URL for one tweet media item, or ``None``.

        ``None`` means no URL could be determined: the item is of an
        unrecognised type, a video has no variant with a bitrate, or a GIF
        has no variants at all.
        """
        if isinstance(media, Video):
            # Pick the highest-bitrate variant; variants without a bitrate
            # are ignored. ``default=None`` avoids a ValueError when no
            # variant qualifies.
            best = max(
                (v for v in media.variants if v.bitrate),
                key=lambda v: v.bitrate,
                default=None,
            )
            return None if best is None else best.url
        if isinstance(media, Gif):
            return media.variants[0].url if media.variants else None
        if isinstance(media, Photo):
            # Request the original-size image instead of the "large" rendition.
            return media.fullUrl.replace('name=large', 'name=orig')
        return None

    def download(self, url, check_if_exists=False):
        """Archive the tweet at *url*.

        Args:
            url: Address of the tweet; its network location must be
                ``twitter.com`` and its path must contain ``status/<id>``.
            check_if_exists: Not used by this implementation.

        Returns:
            ``False`` when the URL is not a usable tweet URL or the tweet
            cannot be fetched; otherwise an ``ArchiveResult`` with
            ``status="success"``.
        """
        if 'twitter.com' != self.get_netloc(url):
            logger.debug(f'{url=} is not from twitter')
            return False

        path_parts = urlparse(url).path.split('/')
        if 'status' not in path_parts:
            logger.debug(f'{url=} does not contain "status"')
            return False

        # The tweet id is the path segment right after "status"; guard against
        # URLs that end at ".../status" or ".../status/".
        id_index = path_parts.index('status') + 1
        if id_index >= len(path_parts) or not path_parts[id_index]:
            logger.debug(f'{url=} has no tweet id after "status"')
            return False
        tweet_id = path_parts[id_index]

        scr = TwitterTweetScraper(tweet_id)

        try:
            tweet = next(scr.get_items())
        except Exception as ex:
            # Best effort: any scraper failure (including an empty result,
            # which surfaces as StopIteration) means the tweet is not archived.
            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
            return False

        if tweet.media is None:
            logger.debug('No media found, archiving tweet text only')
            screenshot = self.get_screenshot(url)
            page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(tweet.json()))
            return ArchiveResult(status="success", cdn_url=page_cdn, title=tweet.content, timestamp=tweet.date, hash=page_hash, screenshot=screenshot)

        urls = []
        for media in tweet.media:
            media_url = self._media_url(media)
            if media_url is None:
                logger.warning(f"Could not get media URL of {media}")
            else:
                urls.append(media_url)

        page_cdn, page_hash, thumbnail = self.generate_media_page(urls, url, tweet.json())

        screenshot = self.get_screenshot(url)

        return ArchiveResult(status="success", cdn_url=page_cdn, screenshot=screenshot, hash=page_hash, thumbnail=thumbnail, timestamp=tweet.date, title=tweet.content)
 |