From 725bab82409bf7c8390af2914e4ae9c61a13d99d Mon Sep 17 00:00:00 2001
From: msramalho <19508417+msramalho@users.noreply.github.com>
Date: Wed, 18 Jan 2023 00:15:18 +0000
Subject: [PATCH] twitter archivers

---
 src/archivers/__init__.py                   |  10 +-
 src/archivers/telethon_archiverv2.py        |   6 +-
 src/archivers/twitter_api_archiverv2.py     |  97 +++++++++++++
 src/archivers/twitter_archiverv2.py         | 137 ++++++++++++++++++
 src/enrichers/thumbnail_enricher.py         |   1 +
 src/formatters/templates/html_template.html |   8 +-
 .../templates/{media.html => macros.html}   |   6 +
 src/metadata.py                             |  18 ++-
 src/orchestrator.py                         |   9 +-
 9 files changed, 273 insertions(+), 19 deletions(-)
 create mode 100644 src/archivers/twitter_api_archiverv2.py
 create mode 100644 src/archivers/twitter_archiverv2.py
 rename src/formatters/templates/{media.html => macros.html} (87%)

diff --git a/src/archivers/__init__.py b/src/archivers/__init__.py
index a2cb67c..f25668d 100644
--- a/src/archivers/__init__.py
+++ b/src/archivers/__init__.py
@@ -2,13 +2,15 @@
 from .base_archiver import Archiver, ArchiveResult
 from .archiver import Archiverv2
 from .telegram_archiver import TelegramArchiver
-from .telethon_archiver import TelethonArchiver
+# from .telethon_archiver import TelethonArchiver
 from .tiktok_archiver import TiktokArchiver
 from .wayback_archiver import WaybackArchiver
 from .youtubedl_archiver import YoutubeDLArchiver
-from .twitter_archiver import TwitterArchiver
+# from .twitter_archiver import TwitterArchiver
 from .vk_archiver import VkArchiver
-from .twitter_api_archiver import TwitterApiArchiver
+# from .twitter_api_archiver import TwitterApiArchiver
 from .instagram_archiver import InstagramArchiver
-from .telethon_archiverv2 import TelethonArchiver
\ No newline at end of file
+from .telethon_archiverv2 import TelethonArchiver
+from .twitter_archiverv2 import TwitterArchiver
+from .twitter_api_archiverv2 import TwitterApiArchiver
\ No newline at end of file

diff --git a/src/archivers/telethon_archiverv2.py b/src/archivers/telethon_archiverv2.py
index 819070a..90de5da 100644
--- a/src/archivers/telethon_archiverv2.py
+++ b/src/archivers/telethon_archiverv2.py
@@ -13,7 +13,7 @@ from media import Media
 
 class TelethonArchiver(Archiverv2):
-    name = "telethon"
+    name = "telethon_archiver"
     link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
     invite_pattern = re.compile(r"t.me(\/joinchat){0,1}\/\+?(.+)")
 
@@ -145,8 +145,8 @@ class TelethonArchiver(Archiverv2):
                 continue
             result.add_media(Media(filename))
 
-        result.set("post", str(post)).set_title(title).set_timestamp(post.date)
-        return result
+        result.set_content(str(post)).set_title(title).set_timestamp(post.date)
+        return result.success("telethon")
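For context: the hunks above move archivers onto the v2 interface, where download() returns a Metadata marked with .success("<name>") when archiving worked, and a falsy value (False) when it did not. A minimal sketch of how a caller can rely on that contract (try_archivers is illustrative and not part of this patch):

    from metadata import Metadata

    def try_archivers(item: Metadata, archivers: list):
        # run archivers in order; the first truthy, success-marked Metadata wins
        for archiver in archivers:
            result = archiver.download(item)
            if result:
                return result
        return False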
diff --git a/src/archivers/twitter_api_archiverv2.py b/src/archivers/twitter_api_archiverv2.py
new file mode 100644
index 0000000..c95795a
--- /dev/null
+++ b/src/archivers/twitter_api_archiverv2.py
@@ -0,0 +1,97 @@
+
+import json
+from datetime import datetime
+import mimetypes
+import os
+from loguru import logger
+from pytwitter import Api
+from slugify import slugify
+
+from metadata import Metadata
+from media import Media
+from .twitter_archiverv2 import TwitterArchiver
+from .archiver import Archiverv2
+
+
+class TwitterApiArchiver(TwitterArchiver, Archiverv2):
+    name = "twitter_api_archiver"
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+
+        if self.bearer_token:
+            self.api = Api(bearer_token=self.bearer_token)
+        elif self.consumer_key and self.consumer_secret and self.access_token and self.access_secret:
+            self.api = Api(
+                consumer_key=self.consumer_key, consumer_secret=self.consumer_secret, access_token=self.access_token, access_secret=self.access_secret)
+        assert hasattr(self, "api") and self.api is not None, "Missing Twitter API configurations, please provide either bearer_token OR (consumer_key, consumer_secret, access_token, access_secret) to use this archiver."
+
+    @staticmethod
+    def configs() -> dict:
+        return {
+            "bearer_token": {"default": None, "help": "twitter API bearer_token, which is enough for archiving; if not provided you will need consumer_key, consumer_secret, access_token, access_secret"},
+            "consumer_key": {"default": None, "help": "twitter API consumer_key"},
+            "consumer_secret": {"default": None, "help": "twitter API consumer_secret"},
+            "access_token": {"default": None, "help": "twitter API access_token"},
+            "access_secret": {"default": None, "help": "twitter API access_secret"},
+        }
+
+    def download(self, item: Metadata) -> Metadata:
+        url = item.get_url()
+        # detect URLs that we definitely cannot handle
+        username, tweet_id = self.get_username_tweet_id(url)
+        if not username: return False
+
+        try:
+            tweet = self.api.get_tweet(tweet_id, expansions=["attachments.media_keys"], media_fields=["type", "duration_ms", "url", "variants"], tweet_fields=["attachments", "author_id", "created_at", "entities", "id", "text", "possibly_sensitive"])
+        except Exception as e:
+            logger.error(f"Could not get tweet: {e}")
+            return False
+
+        result = Metadata()
+        result.set_title(tweet.data.text)
+        result.set_timestamp(datetime.strptime(tweet.data.created_at, "%Y-%m-%dT%H:%M:%S.%fZ"))
+
+        urls = []
+        if tweet.includes:
+            for i, m in enumerate(tweet.includes.media):
+                media = Media(filename="")
+                if m.url and len(m.url):
+                    media.set("src", m.url)
+                    media.set("duration", (m.duration_ms or 1) // 1000)
+                    mimetype = "image/jpeg"
+                elif hasattr(m, "variants"):
+                    variant = self.choose_variant(m.variants)
+                    if not variant: continue
+                    media.set("src", variant.url)
+                    mimetype = variant.content_type
+                else:
+                    continue
+                logger.info(f"Found media {media}")
+                ext = mimetypes.guess_extension(mimetype)
+                media.filename = os.path.join(item.get_tmp_dir(), f'{slugify(url)}_{i}{ext}')
+                urls.append(media.get("src"))  # record the source URL so the JSON content below lists it
+                self.download_from_url(media.get("src"), media.filename)
+                result.add_media(media)
+
+        result.set_content(json.dumps({
+            "id": tweet.data.id,
+            "text": tweet.data.text,
+            "created_at": tweet.data.created_at,
+            "author_id": tweet.data.author_id,
+            "geo": tweet.data.geo,
+            "lang": tweet.data.lang,
+            "media": urls
+        }, ensure_ascii=False, indent=4))
+        return result.success("twitter")
+
+    def choose_variant(self, variants):
+        # choosing the highest quality possible (highest-bitrate MP4)
+        variant, bit_rate = None, -1
+        for var in variants:
+            if var.content_type == "video/mp4":
+                if var.bit_rate > bit_rate:
+                    bit_rate = var.bit_rate
+                    variant = var
+            else:
+                variant = var if not variant else variant
+        return variant
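For reference, choose_variant above prefers the highest-bitrate MP4 and only falls back to a non-MP4 variant when no MP4 exists. A small illustrative check (the Variant namedtuple is a stand-in assumption for pytwitter's variant objects, which expose content_type, bit_rate, and url):

    from collections import namedtuple

    Variant = namedtuple("Variant", ["content_type", "bit_rate", "url"])

    variants = [
        Variant("application/x-mpegURL", None, "https://example.com/playlist.m3u8"),
        Variant("video/mp4", 256_000, "https://example.com/low.mp4"),
        Variant("video/mp4", 2_176_000, "https://example.com/high.mp4"),
    ]
    # choose_variant(variants) returns the 2_176_000 bit-rate MP4 ("high.mp4")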
diff --git a/src/archivers/twitter_archiverv2.py b/src/archivers/twitter_archiverv2.py
new file mode 100644
index 0000000..f23fa0f
--- /dev/null
+++ b/src/archivers/twitter_archiverv2.py
@@ -0,0 +1,137 @@
+import html, re, requests
+import mimetypes
+import json
+import os
+from datetime import datetime
+from loguru import logger
+from metadata import Metadata
+from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
+from archivers import Archiverv2
+from media import Media
+from slugify import slugify
+
+
+class TwitterArchiver(Archiverv2):
+    """
+    This Twitter Archiver uses unofficial scraping methods.
+    """
+
+    name = "twitter_archiver"
+    link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")
+
+    def __init__(self, config: dict) -> None:
+        super().__init__(config)
+
+    @staticmethod
+    def configs() -> dict:
+        return {}
+
+    def download(self, item: Metadata) -> Metadata:
+        """
+        If the URL points to a tweet, archives its text and media via snscrape,
+        falling back to download_alternative if scraping fails.
+        """
+        url = item.get_url()
+        # detect URLs that we definitely cannot handle
+        username, tweet_id = self.get_username_tweet_id(url)
+        if not username: return False
+
+        result = Metadata()
+
+        scr = TwitterTweetScraper(tweet_id)
+        try:
+            tweet = next(scr.get_items())
+        except Exception as ex:
+            logger.warning(f"can't get tweet: {type(ex).__name__} occurred. args: {ex.args}")
+            return self.download_alternative(item, url, tweet_id)
+
+        result.set_title(tweet.content).set_content(tweet.json()).set_timestamp(tweet.date)
+        if tweet.media is None:
+            logger.debug('No media found, archiving tweet text only')
+            return result.success("twitter")
+
+        for i, tweet_media in enumerate(tweet.media):
+            media = Media(filename="")
+            mimetype = ""
+            if type(tweet_media) == Video:
+                variant = max(
+                    [v for v in tweet_media.variants if v.bitrate], key=lambda v: v.bitrate)
+                media.set("src", variant.url).set("duration", tweet_media.duration)
+                mimetype = variant.contentType
+            elif type(tweet_media) == Gif:
+                variant = tweet_media.variants[0]
+                media.set("src", variant.url)
+                mimetype = variant.contentType
+            elif type(tweet_media) == Photo:
+                media.set("src", tweet_media.fullUrl.replace('name=large', 'name=orig'))
+                mimetype = "image/jpeg"
+            else:
+                logger.warning(f"Could not get media URL of {tweet_media}")
+                continue
+            ext = mimetypes.guess_extension(mimetype)
+            media.filename = os.path.join(item.get_tmp_dir(), f'{slugify(url)}_{i}{ext}')
+            self.download_from_url(media.get("src"), media.filename)
+            result.add_media(media)
+
+        return result.success("twitter")
+
+    def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metadata:
+        """
+        CURRENTLY STOPPED WORKING
+        """
+        return False
+        # https://stackoverflow.com/a/71867055/6196010
+        logger.debug(f"Trying twitter hack for {url=}")
+        result = Metadata()
+
+        hack_url = f"https://cdn.syndication.twimg.com/tweet?id={tweet_id}"
+        r = requests.get(hack_url)
+        if r.status_code != 200: return False
+        tweet = r.json()
+
+        urls = []
+        for p in tweet["photos"]:
+            urls.append(p["url"])
+
+        # 1 tweet has 1 video max
+        if "video" in tweet:
+            v = tweet["video"]
+            urls.append(self.choose_variant(v.get("variants", [])))
+
+        logger.debug(f"Twitter hack got {urls=}")
+
+        for i, u in enumerate(urls):  # enumerate so the filename suffix below is defined
+            media = Media(filename="")
+            media.set("src", u)
+            media.filename = os.path.join(item.get_tmp_dir(), f'{slugify(url)}_{i}')
+            self.download_from_url(u, media.filename)
+            result.add_media(media)
+
+        # .set_title(tweet["TODO"])
+        result.set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
+        return result
+
+    def get_username_tweet_id(self, url):
+        # detect URLs that we definitely cannot handle
+        matches = self.link_pattern.findall(url)
+        if not len(matches): return False, False
+
+        username, tweet_id = matches[0]  # only one URL supported
+        logger.debug(f"Found {username=} and {tweet_id=} in {url=}")
+
+        return username, tweet_id
+
+    def choose_variant(self, variants):
+        # choosing the highest quality possible
+        variant, width, height = None, 0, 0
+        for var in variants:
+            if var.get("type", "") == "video/mp4":
+                width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"])
+                if width_height:
+                    w, h = int(width_height[1]), int(width_height[2])
+                    if w > width or h > height:
+                        width, height = w, h
+                        variant = var.get("src", variant)
+            else:
+                variant = var.get("src") if not variant else variant
+        return variant
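The link_pattern in the new file above accepts both the /status/ and the legacy /statuses/ URL forms, with or without the old #! fragment. A quick illustration (the username and tweet id below are made up):

    import re

    link_pattern = re.compile(r"twitter.com\/(?:\#!\/)?(\w+)\/status(?:es)?\/(\d+)")

    for url in [
        "https://twitter.com/some_user/status/1234567890",
        "https://twitter.com/#!/some_user/statuses/1234567890",
    ]:
        print(link_pattern.findall(url))  # [('some_user', '1234567890')] for both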
diff --git a/src/enrichers/thumbnail_enricher.py b/src/enrichers/thumbnail_enricher.py
index 32e09be..94c5ee7 100644
--- a/src/enrichers/thumbnail_enricher.py
+++ b/src/enrichers/thumbnail_enricher.py
@@ -25,6 +25,7 @@ class ThumbnailEnricher(Enricher):
         folder = os.path.join(to_enrich.get_tmp_dir(), str(uuid.uuid4()))
         os.makedirs(folder, exist_ok=True)
         for i, m in enumerate(to_enrich.media[::]):
+            logger.info(m)
             if m.is_video():
                 logger.debug(f"generating thumbnails for {m.filename}")
                 fps, duration = 0.5, m.get("duration")
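Note on the hunk above: the enricher iterates over a copy of the media list (media[::]), presumably because enrichment appends new thumbnail Media entries while looping. A minimal illustration of the copy-while-appending idiom:

    items = [1, 2]
    for x in items[:]:        # iterate a shallow copy...
        items.append(x * 10)  # ...so items appended here are not re-visited
    print(items)              # [1, 2, 10, 20]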
diff --git a/src/formatters/templates/html_template.html b/src/formatters/templates/html_template.html
index 3d99a41..9c3b54e 100644
--- a/src/formatters/templates/html_template.html
+++ b/src/formatters/templates/html_template.html
@@ -1,5 +1,5 @@
 {# templates/results.html #}
-{% import 'media.html' as macros %}
+{% import 'macros.html' as macros %}
@@ -133,8 +133,8 @@
-            {% elif m.properties[prop] | length > 1 %}
-