Adds new extractor for tiktok via unofficial API (#237)

* minor update to defaults in api_db * readme typo * adds and tests new tikwm tiktok downloader * addresses PR comments
2025-03-10 17:56:45 +06:00 · 2025-03-10 17:56:45 +06:00 · 58bd38e292
commit 58bd38e292
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@



-Auto Archiver is a Python tool to automatically archive content on the web in a secure and verifiable way. It takes URLs from different sources (e.g. a CSV file, Google Sheets, command line etc.) and archives the content of each one. It can archive social media posts, videos, images and webpages. Content can enriched, then saved either locally or remotely (S3 bucket, Google Drive). The status of the archiving process can be appended to a CSV report, or if using Google Sheets – back to the original sheet.
+Auto Archiver is a Python tool to automatically archive content on the web in a secure and verifiable way. It takes URLs from different sources (e.g. a CSV file, Google Sheets, command line etc.) and archives the content of each one. It can archive social media posts, videos, images and webpages. Content can be enriched, then saved either locally or remotely (S3 bucket, Google Drive). The status of the archiving process can be appended to a CSV report, or if using Google Sheets – back to the original sheet.

 <div class="hidden_rtd">
  
--- a/src/auto_archiver/modules/api_db/manifest.py
+++ b/src/auto_archiver/modules/api_db/manifest.py
@@ -24,9 +24,9 @@
            "help": "which group of users have access to the archive in case public=false as author",
        },
        "use_api_cache": {
-            "default": True,
+            "default": False,
            "type": "bool",
-            "help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
+            "help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
        },
        "store_results": {
            "default": True,
--- a/src/auto_archiver/modules/tiktok_tikwm_extractor/__init__.py
+++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/__init__.py
@@ -0,0 +1 @@
+from .tiktok_tikwm_extractor import TiktokTikwmExtractor
--- a/src/auto_archiver/modules/tiktok_tikwm_extractor/manifest.py
+++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/manifest.py
@@ -0,0 +1,23 @@
+{
+    "name": "Tiktok Tikwm Extractor",
+    "type": ["extractor"],
+    "requires_setup": False,
+    "dependencies": {
+        "python": ["loguru", "requests", "yt_dlp"],  # yt_dlp supplies the TikTok URL pattern the extractor imports
+        "bin": []
+    },
+    "description": """
+    Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/
+
+    This extractor complements the generic_extractor which can already get TikTok videos, but this one can extract special videos like those marked as sensitive.
+
+    ### Features
+    - Downloads the video and, if possible, also the video cover.
+    - Stores extra metadata about the post like author information, and more as returned by tikwm.com.
+
+    ### Notes
+    - If tikwm.com is down, this extractor will not work.
+    - If tikwm.com changes their API, this extractor may break.
+    - If no video is found, this extractor will consider the extraction failed.
+    """
+}
--- a/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py
+++ b/src/auto_archiver/modules/tiktok_tikwm_extractor/tiktok_tikwm_extractor.py
@@ -0,0 +1,101 @@
+import re
+import requests
+from loguru import logger
+from datetime import datetime, timezone
+from urllib.parse import quote
+from yt_dlp.extractor.tiktok import TikTokIE
+
+from auto_archiver.core import Extractor
+from auto_archiver.core import Metadata, Media
+
+
+class TiktokTikwmExtractor(Extractor):
+    """
+    Extractor for TikTok that uses an unofficial API and can capture content that requires a login, like sensitive content.
+
+    The TikTok URL is sent to tikwm.com and the video, plus a cover when one is
+    listed, is downloaded from the URLs found in the JSON response.
+    """
+
+    TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
+    # seconds to wait on tikwm.com, so a stalled connection cannot block archiving forever
+    TIKWM_TIMEOUT = 30
+
+    def download(self, item: Metadata) -> Metadata:
+        """
+        Archive the TikTok video behind the item's URL via the tikwm.com API.
+
+        Returns a new Metadata marked successful with the video, the cover when
+        it could be downloaded, and the leftover API fields under "api_data".
+        Returns False when the URL is not a TikTok video URL, the API call or
+        its response is unusable, or the video itself cannot be downloaded.
+        """
+        url = item.get_url()
+
+        if not re.match(TikTokIE._VALID_URL, url):
+            return False
+
+        # percent-encode the TikTok URL so its own "?", "&" and "#" characters
+        # are not parsed as part of the tikwm.com query string
+        endpoint = TiktokTikwmExtractor.TIKWM_ENDPOINT.format(url=quote(url, safe=""))
+
+        try:
+            r = requests.get(endpoint, timeout=TiktokTikwmExtractor.TIKWM_TIMEOUT)
+        except requests.RequestException as e:
+            logger.error(f"request to tikwm.com failed for {url=}: {e}")
+            return False
+        if r.status_code != 200:
+            logger.error(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
+            return False
+
+        try:
+            json_response = r.json()
+        except ValueError:
+            logger.error(f"failed to parse JSON response from tikwm.com for {url=}")
+            return False
+
+        # usable responses are JSON objects reporting success with a non-empty "data" object
+        api_data = json_response.get("data") if isinstance(json_response, dict) else None
+        if not (isinstance(api_data, dict) and api_data and json_response.get("msg") == "success"):
+            logger.error(f"failed to get a valid response from tikwm.com for {url=}: {json_response}")
+            return False
+
+        # prefer the non-watermarked version; both keys are popped so neither URL stays in api_data
+        watermarked_url = api_data.pop("wmplay", None)
+        video_url = api_data.pop("play", None) or watermarked_url
+        if not video_url:
+            logger.error(f"no valid video URL found in response from tikwm.com for {url=}")
+            return False
+
+        # prepare result
+        result = Metadata()
+
+        # get the cover if possible: first non-empty candidate wins, all candidates leave api_data
+        cover_candidates = [api_data.pop(key, None) for key in ("origin_cover", "cover", "ai_dynamic_cover")]
+        cover_url = next((candidate for candidate in cover_candidates if candidate), None)
+        if cover_url and (cover_downloaded := self.download_from_url(cover_url)):
+            result.add_media(Media(cover_downloaded))
+
+        # get the video or fail
+        video_downloaded = self.download_from_url(video_url, f"vid_{api_data.get('id', '')}")
+        if not video_downloaded:
+            logger.error(f"failed to download video from {video_url}")
+            return False
+        video_media = Media(video_downloaded)
+        if duration := api_data.pop("duration", None):
+            video_media.set("duration", duration)
+        result.add_media(video_media)
+
+        # add remaining metadata
+        result.set_title(api_data.pop("title", ""))
+
+        if created_at := api_data.pop("create_time", None):
+            result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
+
+        if author := api_data.pop("author", None):
+            result.set("author", author)
+
+        # whatever was not consumed above is stored as-is
+        result.set("api_data", api_data)
+
+        return result.success("tikwm")
--- a/tests/extractors/test_tiktok_tikwm_extractor.py
+++ b/tests/extractors/test_tiktok_tikwm_extractor.py
@@ -0,0 +1,163 @@
+from datetime import datetime, timezone
+import time
+import pytest
+
+from auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor import TiktokTikwmExtractor
+from .test_extractor_base import TestExtractorBase
+
+
+class TestTiktokTikwmExtractor(TestExtractorBase):
+    """
+    Test suite for TiktokTikwmExtractor.
+    """
+
+    extractor_module = "tiktok_tikwm_extractor"
+    extractor: TiktokTikwmExtractor
+
+    config = {}
+
+    VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"
+
+    @staticmethod
+    def get_mockers(mocker):
+        """Patch requests.get and the logger as referenced by the extractor module; return both mocks."""
+        mock_get = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.requests.get")
+        mock_logger = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.logger")
+        return mock_get, mock_logger
+
+    @pytest.mark.parametrize("url,valid_url", [
+        ("https://bellingcat.com", False),
+        ("https://youtube.com", False),
+        ("https://tiktok.co/", False),
+        ("https://tiktok.com/", False),
+        ("https://www.tiktok.com/", False),
+        ("https://api.cool.tiktok.com/", False),
+        (VALID_EXAMPLE_URL, True),
+        ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
+        ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
+    ])
+    def test_valid_urls(self, mocker, make_item, url, valid_url):
+        """Only TikTok video URLs trigger an API call; every case ends in False (no match, or mocked 404)."""
+        mock_get, mock_logger = self.get_mockers(mocker)
+        if valid_url:
+            mock_get.return_value.status_code = 404
+        assert self.extractor.download(make_item(url)) is False
+        assert mock_get.call_count == int(valid_url)
+        assert mock_logger.error.call_count == int(valid_url)
+
+    def test_invalid_json_responses(self, mocker, make_item):
+        """A ValueError from the JSON parser is logged and yields False; any other exception propagates."""
+        mock_get, mock_logger = self.get_mockers(mocker)
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.side_effect = ValueError
+        assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
+        mock_get.assert_called_once()
+        mock_get.return_value.json.assert_called_once()
+        mock_logger.error.assert_called_once()
+        assert mock_logger.error.call_args[0][0].startswith("failed to parse JSON response")
+
+        mock_get.return_value.json.side_effect = Exception
+        with pytest.raises(Exception):
+            self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
+        mock_get.assert_called()
+        assert mock_get.call_count == 2
+        assert mock_get.return_value.json.call_count == 2
+
+    @pytest.mark.parametrize("response", [
+        ({"msg": "failure"}),
+        ({"msg": "success"}),
+    ])
+    def test_unsuccessful_responses(self, mocker, make_item, response):
+        """Responses that do not report success, or carry no data, are rejected with one error log."""
+        mock_get, mock_logger = self.get_mockers(mocker)
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = response
+        assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
+        mock_get.assert_called_once()
+        mock_get.return_value.json.assert_called_once()
+        mock_logger.error.assert_called_once()
+        assert mock_logger.error.call_args[0][0].startswith("failed to get a valid response")
+
+    @pytest.mark.parametrize("response,has_vid", [
+        ({"data": {"id": 123}}, False),
+        ({"data": {"wmplay": "url"}}, True),
+        ({"data": {"play": "url"}}, True),
+    ])
+    def test_correct_extraction(self, mocker, make_item, response, has_vid):
+        """Without a "play"/"wmplay" URL extraction fails; with one it succeeds with a single media."""
+        mock_get, mock_logger = self.get_mockers(mocker)
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = {"msg": "success", **response}
+
+        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
+        if not has_vid:
+            assert result is False
+        else:
+            assert result.is_success()
+            assert len(result.media) == 1
+        mock_get.assert_called()
+        # one API call, plus presumably one more for the video download through the patched requests.get
+        assert mock_get.call_count == 1 + int(has_vid)
+        mock_get.return_value.json.assert_called_once()
+        if not has_vid:
+            mock_logger.error.assert_called_once()
+            assert mock_logger.error.call_args[0][0].startswith("no valid video URL found")
+        else:
+            mock_logger.error.assert_not_called()
+
+    def test_correct_data_extracted(self, mocker, make_item):
+        """Title, author, duration and timestamp are lifted out of the response; unused fields stay in api_data."""
+        mock_get, _ = self.get_mockers(mocker)
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = {"msg": "success", "data": {
+            "wmplay": "url",
+            "origin_cover": "cover.jpg",
+            "title": "Title",
+            "id": 123,
+            "duration": 60,
+            "create_time": 1736301699,
+            "author": "Author",
+            "other": "data"
+        }}
+
+        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
+        assert result.is_success()
+        assert len(result.media) == 2
+        assert result.get_title() == "Title"
+        assert result.get("author") == "Author"
+        assert result.get("api_data") == {"other": "data", "id": 123}
+        assert result.media[1].get("duration") == 60
+        assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)
+
+    @pytest.mark.download
+    def test_download_video(self, make_item):
+        """Unmocked run against a real TikTok URL; selected through the "download" marker."""
+        url = "https://www.tiktok.com/@bbcnews/video/7478038212070411542"
+
+        result = self.extractor.download(make_item(url))
+        assert result.is_success()
+        assert len(result.media) == 2
+        assert result.get_title() == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg  #A23a  #Antarctica  #Ice  #ClimateChange  #DavidAttenborough  #Ocean  #Sea  #SouthGeorgia  #BBCNews "
+        assert result.get("author").get("unique_id") == "bbcnews"
+        assert result.get("api_data").get("id") == '7478038212070411542'
+        assert result.media[1].get("duration") == 59
+        assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)
+
+    @pytest.mark.download
+    def test_download_sensitive_video(self, make_item, mock_sleep):
+        """Unmocked run against a real URL for a video that, per the test name, is marked sensitive."""
+        # sleep is needed because of the rate limit
+        mock_sleep.stop()
+        time.sleep(1.1)
+        mock_sleep.start()
+
+        url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"
+
+        result = self.extractor.download(make_item(url))
+        assert result.is_success()
+        assert len(result.media) == 2
+        assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
+        assert result.get("author").get("id") == "7197400619475649562"
+        assert result.get("api_data").get("id") == '7441821351142362375'
+        assert result.media[1].get("duration") == 34
+        assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)
				`@ -0,0 +1 @@`
				`from .tiktok_tikwm_extractor import TiktokTikwmExtractor`