Add yt-dlp based archiving for TwitterArchiver (#138)

* Add ytdlp archiving capability

* Add type annotation

* Version bump

---------

Co-authored-by: msramalho <19508417+msramalho@users.noreply.github.com>
main v0.11.2
Jett Chen 2024-04-16 02:54:55 +08:00 committed by GitHub
parent f603400d0d
commit cf8691bad7
No key matching this signature was found in the database
GPG key ID: B5690EEEBB952194
2 changed files with 43 additions and 7 deletions
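
A minimal sketch (not part of the commit) of the yt-dlp fallback path this change adds: when the syndication endpoint fails, download_ytdl() pulls the raw tweet JSON through yt-dlp's internal Twitter extractor. The helper name fetch_tweet_json and the tweet id below are made up; TwitterIE._extract_status is a private yt-dlp API and may change between releases.

# Sketch only: mirrors the calls download_ytdl() makes in this commit.
from yt_dlp import YoutubeDL
from yt_dlp.extractor.twitter import TwitterIE

def fetch_tweet_json(tweet_id: str) -> dict:  # hypothetical helper
    downloader = YoutubeDL()
    tie = TwitterIE(downloader)  # private extractor API, not a stable interface
    return tie._extract_status(tweet_id)

tweet = fetch_tweet_json("1234567890")  # made-up tweet id
print(tweet.get("full_text", ""))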


@@ -2,6 +2,8 @@ import re, requests, mimetypes, json
 from datetime import datetime
 from loguru import logger
 from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
+from yt_dlp import YoutubeDL
+from yt_dlp.extractor.twitter import TwitterIE
 from slugify import slugify

 from . import Archiver
@@ -98,7 +100,9 @@ class TwitterArchiver(Archiver):
         hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
         r = requests.get(hack_url)
-        if r.status_code != 200: return False
+        if r.status_code != 200 or r.json()=={}:
+            logger.warning(f"Failed to get tweet information from {hack_url}, trying ytdl")
+            return self.download_ytdl(item, url, tweet_id)

         tweet = r.json()
         urls = []
@@ -108,7 +112,7 @@ class TwitterArchiver(Archiver):
         # 1 tweet has 1 video max
         if "video" in tweet:
             v = tweet["video"]
-            urls.append(self.choose_variant(v.get("variants", [])))
+            urls.append(self.choose_variant(v.get("variants", []))['url'])

         logger.debug(f"Twitter hack got {urls=}")
@@ -125,6 +129,38 @@ class TwitterArchiver(Archiver):
         result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
         return result.success("twitter-hack")

+    def download_ytdl(self, item: Metadata, url:str, tweet_id:str) -> Metadata:
+        downloader = YoutubeDL()
+        tie = TwitterIE(downloader)
+        tweet = tie._extract_status(tweet_id)
+        result = Metadata()
+        result\
+            .set_title(tweet.get('full_text', ''))\
+            .set_content(json.dumps(tweet, ensure_ascii=False))\
+            .set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
+        if not tweet.get("entities", {}).get("media"):
+            logger.debug('No media found, archiving tweet text only')
+            return result
+        for i, tw_media in enumerate(tweet["entities"]["media"]):
+            media = Media(filename="")
+            mimetype = ""
+            if tw_media["type"] == "photo":
+                media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
+                mimetype = "image/jpeg"
+            elif tw_media["type"] == "video":
+                variant = self.choose_variant(tw_media['video_info']['variants'])
+                media.set("src", variant['url'])
+                mimetype = variant['content_type']
+            elif tw_media["type"] == "animated_gif":
+                variant = tw_media['video_info']['variants'][0]
+                media.set("src", variant['url'])
+                mimetype = variant['content_type']
+            ext = mimetypes.guess_extension(mimetype)
+            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
+            result.add_media(media)
+        return result.success("twitter-ytdl")
+
     def get_username_tweet_id(self, url):
         # detect URLs that we definitely cannot handle
@@ -140,13 +176,13 @@ class TwitterArchiver(Archiver):
         # choosing the highest quality possible
         variant, width, height = None, 0, 0
         for var in variants:
-            if var.get("type", "") == "video/mp4":
-                width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"])
+            if var.get("content_type", "") == "video/mp4":
+                width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
                 if width_height:
                     w, h = int(width_height[1]), int(width_height[2])
                     if w > width or h > height:
                         width, height = w, h
-                        variant = var.get("src", variant)
+                        variant = var
             else:
-                variant = var.get("src") if not variant else variant
+                variant = var if not variant else variant
         return variant
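
A short usage sketch of the updated choose_variant(): it now reads the 'content_type' and 'url' keys found in yt-dlp's tweet JSON (the old code expected snscrape-style 'type'/'src') and returns the whole mp4 variant dict whose URL advertises the largest WxH, which is why the hack path above appends choose_variant(...)['url']. The variant data below is fabricated and `archiver` stands for an already-configured TwitterArchiver instance.

# Illustration with made-up variants; `archiver` is an assumed TwitterArchiver instance.
variants = [
    {"content_type": "application/x-mpegURL",
     "url": "https://video.twimg.com/ext_tw_video/1/pl/playlist.m3u8"},
    {"content_type": "video/mp4",
     "url": "https://video.twimg.com/ext_tw_video/1/vid/480x852/low.mp4"},
    {"content_type": "video/mp4",
     "url": "https://video.twimg.com/ext_tw_video/1/vid/720x1280/high.mp4"},
]
best = archiver.choose_variant(variants)
assert best["url"].endswith("720x1280/high.mp4")  # highest resolution wins; a dict is returned, not a bare URL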


@@ -3,7 +3,7 @@ _MAJOR = "0"
 _MINOR = "11"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "1"
+_PATCH = "2"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""