Adds new extractor for tiktok via unofficial API (#237)

* minor update to defaults in api_db

* readme typo

* adds and tests new tikwm tiktok downloader

* addresses PR comments
pull/249/head
Miguel Sozinho Ramalho 2025-03-10 17:56:45 +06:00 zatwierdzone przez GitHub
rodzic ce46a8a7ac
commit 58bd38e292
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
6 zmienionych plików z 256 dodań i 3 usunięć

Wyświetl plik

@ -10,7 +10,7 @@
Auto Archiver is a Python tool to automatically archive content on the web in a secure and verifiable way. It takes URLs from different sources (e.g. a CSV file, Google Sheets, command line etc.) and archives the content of each one. It can archive social media posts, videos, images and webpages. Content can enriched, then saved either locally or remotely (S3 bucket, Google Drive). The status of the archiving process can be appended to a CSV report, or if using Google Sheets – back to the original sheet.
Auto Archiver is a Python tool to automatically archive content on the web in a secure and verifiable way. It takes URLs from different sources (e.g. a CSV file, Google Sheets, command line etc.) and archives the content of each one. It can archive social media posts, videos, images and webpages. Content can be enriched, then saved either locally or remotely (S3 bucket, Google Drive). The status of the archiving process can be appended to a CSV report, or if using Google Sheets – back to the original sheet.
<div class="hidden_rtd">

Wyświetl plik

@ -24,9 +24,9 @@
"help": "which group of users have access to the archive in case public=false as author",
},
"use_api_cache": {
"default": True,
"default": False,
"type": "bool",
"help": "if False then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
"help": "if True then the API database will be queried prior to any archiving operations and stop if the link has already been archived",
},
"store_results": {
"default": True,

Wyświetl plik

@ -0,0 +1 @@
from .tiktok_tikwm_extractor import TiktokTikwmExtractor

Wyświetl plik

@ -0,0 +1,23 @@
{
    # Module manifest for the tikwm-based TikTok extractor.
    "name": "Tiktok Tikwm Extractor",
    "type": ["extractor"],
    "requires_setup": False,
    "dependencies": {
        # yt_dlp is listed because the extractor imports TikTokIE from it
        # to validate TikTok URLs before calling the tikwm API.
        "python": ["loguru", "requests", "yt_dlp"],
        "bin": []
    },
    "description": """
Uses an unofficial TikTok video download platform's API to download videos: https://tikwm.com/
This extractor complements the generic_extractor which can already get TikTok videos, but this one can extract special videos like those marked as sensitive.
### Features
- Downloads the video and, if possible, also the video cover.
- Stores extra metadata about the post like author information, and more as returned by tikwm.com.
### Notes
- If tikwm.com is down, this extractor will not work.
- If tikwm.com changes their API, this extractor may break.
- If no video is found, this extractor will consider the extraction failed.
"""
}

Wyświetl plik

@ -0,0 +1,75 @@
import re
import requests
from loguru import logger
from datetime import datetime, timezone
from yt_dlp.extractor.tiktok import TikTokIE
from auto_archiver.core import Extractor
from auto_archiver.core import Metadata, Media
class TiktokTikwmExtractor(Extractor):
    """
    Extractor for TikTok that uses an unofficial API and can capture content that requires a login, like sensitive content.
    """

    TIKWM_ENDPOINT = "https://www.tikwm.com/api/?url={url}"
    # Upper bound, in seconds, on how long to wait for tikwm.com to answer.
    # Without it a stalled connection would block the extractor indefinitely.
    TIKWM_TIMEOUT = 30

    def download(self, item: Metadata) -> Metadata:
        """
        Fetch a TikTok post through the tikwm.com API.

        Args:
            item: metadata whose URL is the TikTok post to extract.

        Returns:
            A new Metadata marked as successful ("tikwm") holding the video,
            the cover when one could be downloaded, and the remaining API
            fields; or False when the URL is not a TikTok video URL, the API
            call fails, or no video could be downloaded.
        """
        url = item.get_url()
        if not re.match(TikTokIE._VALID_URL, url):
            return False

        endpoint = TiktokTikwmExtractor.TIKWM_ENDPOINT.format(url=url)
        try:
            r = requests.get(endpoint, timeout=self.TIKWM_TIMEOUT)
        except requests.RequestException as e:
            # connection errors and timeouts are treated like any other API failure
            logger.error(f"request to tikwm.com failed for {url=}: {e}")
            return False
        if r.status_code != 200:
            logger.error(f"unexpected status code '{r.status_code}' from tikwm.com for {url=}:")
            return False

        try:
            json_response = r.json()
        except ValueError:
            logger.error(f"failed to parse JSON response from tikwm.com for {url=}")
            return False

        # guard against payloads that are valid JSON but not the expected object shape
        api_data = json_response.get("data") if isinstance(json_response, dict) else None
        if not isinstance(api_data, dict) or not api_data or json_response.get("msg") != "success":
            logger.error(f"failed to get a valid response from tikwm.com for {url=}: {json_response}")
            return False

        # tries to get the non-watermarked version first; both keys are removed
        # from api_data, and an empty/null "play" still falls back to "wmplay"
        play_url = api_data.pop("play", None)
        wmplay_url = api_data.pop("wmplay", None)
        video_url = play_url or wmplay_url
        if not video_url:
            logger.error(f"no valid video URL found in response from tikwm.com for {url=}")
            return False

        # prepare result, start by downloading video
        result = Metadata()

        # get the cover if possible: all cover keys are removed from api_data
        # and the first non-empty one, in order of preference, is used
        cover_candidates = [api_data.pop(key, None) for key in ("origin_cover", "cover", "ai_dynamic_cover")]
        cover_url = next((candidate for candidate in cover_candidates if candidate), None)
        if cover_url and (cover_downloaded := self.download_from_url(cover_url)):
            result.add_media(Media(cover_downloaded))

        # get the video or fail
        video_downloaded = self.download_from_url(video_url, f"vid_{api_data.get('id', '')}")
        if not video_downloaded:
            logger.error(f"failed to download video from {video_url}")
            return False
        video_media = Media(video_downloaded)
        if duration := api_data.pop("duration", None):
            video_media.set("duration", duration)
        result.add_media(video_media)

        # add remaining metadata
        result.set_title(api_data.pop("title", ""))
        if created_at := api_data.pop("create_time", None):
            result.set_timestamp(datetime.fromtimestamp(created_at, tz=timezone.utc))
        if (author := api_data.pop("author", None)):
            result.set("author", author)
        # whatever was not consumed above is kept verbatim
        result.set("api_data", api_data)

        return result.success("tikwm")

Wyświetl plik

@ -0,0 +1,154 @@
from datetime import datetime, timezone
import time
import pytest
from auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor import TiktokTikwmExtractor
from .test_extractor_base import TestExtractorBase
class TestTiktokTikwmExtractor(TestExtractorBase):
    """
    Test suite for TiktokTikwmExtractor.
    """

    extractor_module = "tiktok_tikwm_extractor"
    extractor: TiktokTikwmExtractor

    config = {}

    VALID_EXAMPLE_URL = "https://www.tiktok.com/@example/video/1234"

    @staticmethod
    def get_mockers(mocker):
        """Patch `requests.get` and the module logger; return both mocks."""
        mock_get = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.requests.get")
        mock_logger = mocker.patch("auto_archiver.modules.tiktok_tikwm_extractor.tiktok_tikwm_extractor.logger")
        return mock_get, mock_logger

    @pytest.mark.parametrize("url,valid_url", [
        ("https://bellingcat.com", False),
        ("https://youtube.com", False),
        ("https://tiktok.co/", False),
        ("https://tiktok.com/", False),
        ("https://www.tiktok.com/", False),
        ("https://api.cool.tiktok.com/", False),
        (VALID_EXAMPLE_URL, True),
        ("https://www.tiktok.com/@bbcnews/video/7478038212070411542", True),
        ("https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375", True),
    ])
    def test_valid_urls(self, mocker, make_item, url, valid_url):
        """Only TikTok video URLs reach the API; everything else is rejected silently."""
        mock_get, mock_logger = self.get_mockers(mocker)
        if valid_url:
            # a non-200 answer makes the valid-URL case fail right after the request
            mock_get.return_value.status_code = 404
        assert self.extractor.download(make_item(url)) is False
        assert mock_get.call_count == int(valid_url)
        assert mock_logger.error.call_count == int(valid_url)

    def test_invalid_json_responses(self, mocker, make_item):
        """A JSON parsing ValueError is handled; any other exception propagates."""
        mock_get, mock_logger = self.get_mockers(mocker)
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.side_effect = ValueError
        assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
        mock_get.assert_called_once()
        mock_get.return_value.json.assert_called_once()
        mock_logger.error.assert_called_once()
        assert mock_logger.error.call_args[0][0].startswith("failed to parse JSON response")

        mock_get.return_value.json.side_effect = Exception
        with pytest.raises(Exception):
            self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
        mock_get.assert_called()
        assert mock_get.call_count == 2
        assert mock_get.return_value.json.call_count == 2

    @pytest.mark.parametrize("response", [
        ({"msg": "failure"}),
        ({"msg": "success"}),
    ])
    def test_unsuccessful_responses(self, mocker, make_item, response):
        """Responses without a success message or without data are rejected."""
        mock_get, mock_logger = self.get_mockers(mocker)
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = response
        assert self.extractor.download(make_item(self.VALID_EXAMPLE_URL)) is False
        mock_get.assert_called_once()
        mock_get.return_value.json.assert_called_once()
        mock_logger.error.assert_called_once()
        assert mock_logger.error.call_args[0][0].startswith("failed to get a valid response")

    # NOTE: this test used to share the name `test_correct_extraction` with the
    # test below, which silently replaced it in the class namespace so it never ran.
    @pytest.mark.parametrize("response,has_vid", [
        ({"data": {"id": 123}}, False),
        ({"data": {"wmplay": "url"}}, True),
        ({"data": {"play": "url"}}, True),
    ])
    def test_extraction_requires_video_url(self, mocker, make_item, response, has_vid):
        """Extraction succeeds only when the API data carries a video URL."""
        mock_get, mock_logger = self.get_mockers(mocker)
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = {"msg": "success", **response}
        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
        if not has_vid:
            assert result is False
        else:
            assert result.is_success()
            assert len(result.media) == 1
        mock_get.assert_called()
        # one call for the API plus one for the video download when present
        assert mock_get.call_count == 1 + int(has_vid)
        mock_get.return_value.json.assert_called_once()
        if not has_vid:
            mock_logger.error.assert_called_once()
            assert mock_logger.error.call_args[0][0].startswith("no valid video URL found")
        else:
            mock_logger.error.assert_not_called()

    def test_correct_extraction(self, mocker, make_item):
        """All known API fields end up in the right place of the result."""
        mock_get, _ = self.get_mockers(mocker)
        mock_get.return_value.status_code = 200
        mock_get.return_value.json.return_value = {"msg": "success", "data": {
            "wmplay": "url",
            "origin_cover": "cover.jpg",
            "title": "Title",
            "id": 123,
            "duration": 60,
            "create_time": 1736301699,
            "author": "Author",
            "other": "data"
        }}
        result = self.extractor.download(make_item(self.VALID_EXAMPLE_URL))
        assert result.is_success()
        assert len(result.media) == 2
        assert result.get_title() == "Title"
        assert result.get("author") == "Author"
        assert result.get("api_data") == {"other": "data", "id": 123}
        assert result.media[1].get("duration") == 60
        assert result.get("timestamp") == datetime.fromtimestamp(1736301699, tz=timezone.utc)

    @pytest.mark.download
    def test_download_video(self, make_item):
        """Live download of a regular public video."""
        url = "https://www.tiktok.com/@bbcnews/video/7478038212070411542"
        result = self.extractor.download(make_item(url))
        assert result.is_success()
        assert len(result.media) == 2
        assert result.get_title() == "The A23a iceberg is one of the world's oldest and it's so big you can see it from space. #Iceberg #A23a #Antarctica #Ice #ClimateChange #DavidAttenborough #Ocean #Sea #SouthGeorgia #BBCNews "
        assert result.get("author").get("unique_id") == "bbcnews"
        assert result.get("api_data").get("id") == '7478038212070411542'
        assert result.media[1].get("duration") == 59
        assert result.get("timestamp") == datetime.fromtimestamp(1741122000, tz=timezone.utc)

    @pytest.mark.download
    def test_download_sensitive_video(self, make_item, mock_sleep):
        """Live download of a video flagged as sensitive."""
        # sleep is needed because of the rate limit
        mock_sleep.stop()
        time.sleep(1.1)
        mock_sleep.start()

        url = "https://www.tiktok.com/@ggs68taiwan.official/video/7441821351142362375"
        result = self.extractor.download(make_item(url))
        assert result.is_success()
        assert len(result.media) == 2
        assert result.get_title() == "Căng nhất lúc này #ggs68 #ggs68taiwan #taiwan #dailoan #tiktoknews"
        assert result.get("author").get("id") == "7197400619475649562"
        assert result.get("api_data").get("id") == '7441821351142362375'
        assert result.media[1].get("duration") == 34
        assert result.get("timestamp") == datetime.fromtimestamp(1732684060, tz=timezone.utc)