Porównaj commity

...

9 commitów

Autor SHA1 Wiadomość Data
bashonly 0ab8d57561
Merge 8cc031f259 into ff38a011d5 2024-04-22 04:47:21 +03:00
bashonly ff38a011d5
[ie/crunchyroll] Fix auth and remove cookies support (#9749)
Closes #9745
Authored by: bashonly
2024-04-21 22:41:40 +00:00
bashonly 8056a3026e
[ie/theatercomplextown] Fix extractors (#9754)
Authored by: bashonly
2024-04-21 16:05:42 +00:00
Simon Sawicki 3ee1194288
[ie] Make `_search_nextjs_data` non fatal (#8937)
Authored by: Grub4K
2024-04-21 13:40:38 +02:00
bashonly e3b42d8b1b
[ie/facebook] Fix DASH formats extraction (#9734)
Closes #9720
Authored by: bashonly
2024-04-20 10:23:12 +00:00
bashonly c9ce57d9bf
[ie/patreon] Fix Vimeo embed extraction (#9712)
Fixes regression in 36b240f9a7

Closes #9709
Authored by: bashonly
2024-04-18 23:18:56 +00:00
bashonly 02483bea1c
[build] Normalize `curl_cffi` group to `curl-cffi` (#9698)
Closes #9682
Authored by: bashonly
2024-04-18 23:11:12 +00:00
bashonly 8cc031f259
add prefix url test
Authored by: bashonly
2024-04-10 10:30:38 -05:00
bashonly 9d37b9e298
[ie/tiktok:user] Fix extractor
Authored by: bashonly
2024-04-10 10:26:03 -05:00
13 zmienionych plików z 258 dodaniami i 159 usunięciami

Wyświetl plik

@ -254,7 +254,7 @@ jobs:
# We need to fuse our own universal2 wheels for curl_cffi # We need to fuse our own universal2 wheels for curl_cffi
python3 -m pip install -U --user delocate python3 -m pip install -U --user delocate
mkdir curl_cffi_whls curl_cffi_universal2 mkdir curl_cffi_whls curl_cffi_universal2
python3 devscripts/install_deps.py --print -o --include curl_cffi > requirements.txt python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt
for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do
python3 -m pip download \ python3 -m pip download \
--only-binary=:all: \ --only-binary=:all: \
@ -362,7 +362,7 @@ jobs:
- name: Install Requirements - name: Install Requirements
run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds
python devscripts/install_deps.py -o --include build python devscripts/install_deps.py -o --include build
python devscripts/install_deps.py --include py2exe --include curl_cffi python devscripts/install_deps.py --include py2exe --include curl-cffi
python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl" python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl"
- name: Prepare - name: Prepare

Wyświetl plik

@ -202,7 +202,7 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly
The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting. The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting.
* [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE) * [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE)
* Can be installed with the `curl_cffi` group, e.g. `pip install yt-dlp[default,curl_cffi]` * Can be installed with the `curl-cffi` group, e.g. `pip install yt-dlp[default,curl-cffi]`
* Currently only included in `yt-dlp.exe` and `yt-dlp_macos` builds * Currently only included in `yt-dlp.exe` and `yt-dlp_macos` builds

Wyświetl plik

@ -53,7 +53,7 @@ dependencies = [
[project.optional-dependencies] [project.optional-dependencies]
default = [] default = []
curl_cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"] curl-cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"]
secretstorage = [ secretstorage = [
"cffi", "cffi",
"secretstorage", "secretstorage",

Wyświetl plik

@ -1906,6 +1906,15 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
expected_status=TEAPOT_RESPONSE_STATUS) expected_status=TEAPOT_RESPONSE_STATUS)
self.assertEqual(content, TEAPOT_RESPONSE_BODY) self.assertEqual(content, TEAPOT_RESPONSE_BODY)
def test_search_nextjs_data(self):
data = '<script id="__NEXT_DATA__" type="application/json">{"props":{}}</script>'
self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}})
self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {})
self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None)
self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {})
with self.assertRaises(DeprecationWarning):
self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {})
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

Wyświetl plik

@ -105,7 +105,7 @@ class AsobiStageIE(InfoExtractor):
video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_] video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_]
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
event_data = traverse_obj( event_data = traverse_obj(
self._search_nextjs_data(webpage, video_id, default='{}'), self._search_nextjs_data(webpage, video_id, default={}),
('props', 'pageProps', 'eventCMSData', { ('props', 'pageProps', 'eventCMSData', {
'title': ('event_name', {str}), 'title': ('event_name', {str}),
'thumbnail': ('event_thumbnail_image', {url_or_none}), 'thumbnail': ('event_thumbnail_image', {url_or_none}),

Wyświetl plik

@ -1738,12 +1738,16 @@ class InfoExtractor:
traverse_json_ld(json_ld) traverse_json_ld(json_ld)
return filter_dict(info) return filter_dict(info)
def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw):
return self._parse_json( if default == '{}':
self._search_regex( self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead')
r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', default = {}
webpage, 'next.js data', fatal=fatal, **kw), if default is not NO_DEFAULT:
video_id, transform_source=transform_source, fatal=fatal) fatal = False
return self._search_json(
r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data',
video_id, end_pattern='</script>', fatal=fatal, default=default, **kw)
def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""

Wyświetl plik

@ -24,11 +24,15 @@ class CrunchyrollBaseIE(InfoExtractor):
_BASE_URL = 'https://www.crunchyroll.com' _BASE_URL = 'https://www.crunchyroll.com'
_API_BASE = 'https://api.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com'
_NETRC_MACHINE = 'crunchyroll' _NETRC_MACHINE = 'crunchyroll'
_REFRESH_TOKEN = None
_AUTH_HEADERS = None _AUTH_HEADERS = None
_AUTH_EXPIRY = None
_API_ENDPOINT = None _API_ENDPOINT = None
_BASIC_AUTH = None _BASIC_AUTH = 'Basic ' + base64.b64encode(':'.join((
't-kdgp2h8c3jub8fn0fq',
'yfLDfMfrYvKXh4JXS1LEI2cCqu1v5Wan',
)).encode()).decode()
_IS_PREMIUM = None _IS_PREMIUM = None
_CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q')
_LOCALE_LOOKUP = { _LOCALE_LOOKUP = {
'ar': 'ar-SA', 'ar': 'ar-SA',
'de': 'de-DE', 'de': 'de-DE',
@ -43,69 +47,74 @@ class CrunchyrollBaseIE(InfoExtractor):
'hi': 'hi-IN', 'hi': 'hi-IN',
} }
@property def _set_auth_info(self, response):
def is_logged_in(self): CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(response, ('access_token', {jwt_decode_hs256}, 'benefits', ...))
return bool(self._get_cookies(self._BASE_URL).get('etp_rt')) CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': response['token_type'] + ' ' + response['access_token']}
CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10)
def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'):
try: # TODO: Add impersonation support here
return self._download_json(
f'{self._BASE_URL}/auth/v1/token', None, note=note, errnote=errnote,
headers=headers, data=urlencode_postdata(data))
except ExtractorError as error:
if not isinstance(error.cause, HTTPError) or error.cause.status != 403:
raise
raise ExtractorError(
'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, '
'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
'and your browser\'s User-Agent (with --user-agent)', expected=True)
def _perform_login(self, username, password): def _perform_login(self, username, password):
if self.is_logged_in: if not CrunchyrollBaseIE._REFRESH_TOKEN:
CrunchyrollBaseIE._REFRESH_TOKEN = self.cache.load(self._NETRC_MACHINE, username)
if CrunchyrollBaseIE._REFRESH_TOKEN:
return return
upsell_response = self._download_json(
f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id',
query={
'sess_id': 1,
'device_id': 'whatvalueshouldbeforweb',
'device_type': 'com.crunchyroll.static',
'access_token': 'giKq5eY27ny3cqz',
'referer': f'{self._BASE_URL}/welcome/login'
})
if upsell_response['code'] != 'ok':
raise ExtractorError('Could not get session id')
session_id = upsell_response['data']['session_id']
login_response = self._download_json(
f'{self._API_BASE}/login.1.json', None, 'Logging in',
data=urlencode_postdata({
'account': username,
'password': password,
'session_id': session_id
}))
if login_response['code'] != 'ok':
raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
if not self.is_logged_in:
raise ExtractorError('Login succeeded but did not set etp_rt cookie')
def _update_auth(self):
if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds():
return
if not CrunchyrollBaseIE._BASIC_AUTH:
cx_api_param = self._CLIENT_ID[self.is_logged_in]
self.write_debug(f'Using cxApiParam={cx_api_param}')
CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode()
auth_headers = {'Authorization': CrunchyrollBaseIE._BASIC_AUTH}
if self.is_logged_in:
grant_type = 'etp_rt_cookie'
else:
grant_type = 'client_id'
auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
try: try:
auth_response = self._download_json( login_response = self._request_token(
f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', headers={'Authorization': self._BASIC_AUTH}, data={
headers=auth_headers, data=f'grant_type={grant_type}'.encode()) 'username': username,
'password': password,
'grant_type': 'password',
'scope': 'offline_access',
}, note='Logging in', errnote='Failed to log in')
except ExtractorError as error: except ExtractorError as error:
if isinstance(error.cause, HTTPError) and error.cause.status == 403: if isinstance(error.cause, HTTPError) and error.cause.status == 401:
raise ExtractorError( raise ExtractorError('Invalid username and/or password', expected=True)
'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, '
'then pass the fresh cookies (with --cookies-from-browser or --cookies) '
'and your browser\'s User-Agent (with --user-agent)', expected=True)
raise raise
CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(auth_response, ('access_token', {jwt_decode_hs256}, 'benefits', ...)) CrunchyrollBaseIE._REFRESH_TOKEN = login_response['refresh_token']
CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} self.cache.store(self._NETRC_MACHINE, username, CrunchyrollBaseIE._REFRESH_TOKEN)
CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) self._set_auth_info(login_response)
def _update_auth(self):
if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_EXPIRY > time_seconds():
return
auth_headers = {'Authorization': self._BASIC_AUTH}
if CrunchyrollBaseIE._REFRESH_TOKEN:
data = {
'refresh_token': CrunchyrollBaseIE._REFRESH_TOKEN,
'grant_type': 'refresh_token',
'scope': 'offline_access',
}
else:
data = {'grant_type': 'client_id'}
auth_headers['ETP-Anonymous-ID'] = uuid.uuid4()
try:
auth_response = self._request_token(auth_headers, data)
except ExtractorError as error:
username, password = self._get_login_info()
if not username or not isinstance(error.cause, HTTPError) or error.cause.status != 400:
raise
self.to_screen('Refresh token has expired. Re-logging in')
CrunchyrollBaseIE._REFRESH_TOKEN = None
self.cache.store(self._NETRC_MACHINE, username, None)
self._perform_login(username, password)
return
self._set_auth_info(auth_response)
def _locale_from_language(self, language): def _locale_from_language(self, language):
config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True) config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True)
@ -168,7 +177,8 @@ class CrunchyrollBaseIE(InfoExtractor):
self._update_auth() self._update_auth()
stream_response = self._download_json( stream_response = self._download_json(
f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play', f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play',
display_id, note='Downloading stream info', headers=CrunchyrollBaseIE._AUTH_HEADERS) display_id, note='Downloading stream info', errnote='Failed to download stream info',
headers=CrunchyrollBaseIE._AUTH_HEADERS)
available_formats = {'': ('', '', stream_response['url'])} available_formats = {'': ('', '', stream_response['url'])}
for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])): for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])):
@ -383,9 +393,9 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE):
if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')):
message = f'This {object_type} is for premium members only' message = f'This {object_type} is for premium members only'
if self.is_logged_in: if CrunchyrollBaseIE._REFRESH_TOKEN:
raise ExtractorError(message, expected=True) raise ExtractorError(message, expected=True)
self.raise_login_required(message) self.raise_login_required(message, method='password')
result['formats'], result['subtitles'] = self._extract_stream(internal_id) result['formats'], result['subtitles'] = self._extract_stream(internal_id)
@ -575,9 +585,9 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE):
if not self._IS_PREMIUM and response.get('isPremiumOnly'): if not self._IS_PREMIUM and response.get('isPremiumOnly'):
message = f'This {response.get("type") or "media"} is for premium members only' message = f'This {response.get("type") or "media"} is for premium members only'
if self.is_logged_in: if CrunchyrollBaseIE._REFRESH_TOKEN:
raise ExtractorError(message, expected=True) raise ExtractorError(message, expected=True)
self.raise_login_required(message) self.raise_login_required(message, method='password')
result = self._transform_music_response(response) result = self._transform_music_response(response)
result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id)

Wyświetl plik

@ -560,7 +560,7 @@ class FacebookIE(InfoExtractor):
js_data, lambda x: x['jsmods']['instances'], list) or []) js_data, lambda x: x['jsmods']['instances'], list) or [])
def extract_dash_manifest(video, formats): def extract_dash_manifest(video, formats):
dash_manifest = video.get('dash_manifest') dash_manifest = traverse_obj(video, 'dash_manifest', 'playlist', expected_type=str)
if dash_manifest: if dash_manifest:
formats.extend(self._parse_mpd_formats( formats.extend(self._parse_mpd_formats(
compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)),

Wyświetl plik

@ -1,8 +1,8 @@
import itertools import itertools
import urllib.parse
from .common import InfoExtractor from .common import InfoExtractor
from .vimeo import VimeoIE from .vimeo import VimeoIE
from ..compat import compat_urllib_parse_unquote
from ..networking.exceptions import HTTPError from ..networking.exceptions import HTTPError
from ..utils import ( from ..utils import (
KNOWN_EXTENSIONS, KNOWN_EXTENSIONS,
@ -14,7 +14,6 @@ from ..utils import (
parse_iso8601, parse_iso8601,
str_or_none, str_or_none,
traverse_obj, traverse_obj,
try_get,
url_or_none, url_or_none,
urljoin, urljoin,
) )
@ -199,6 +198,27 @@ class PatreonIE(PatreonBaseIE):
'channel_id': '2147162', 'channel_id': '2147162',
'uploader_url': 'https://www.patreon.com/yaboyroshi', 'uploader_url': 'https://www.patreon.com/yaboyroshi',
}, },
}, {
# NSFW vimeo embed URL
'url': 'https://www.patreon.com/posts/4k-spiderman-4k-96414599',
'info_dict': {
'id': '902250943',
'ext': 'mp4',
'title': '❤️(4K) Spiderman Girl Yeonhwas Gift ❤️(4K) 스파이더맨걸 연화의 선물',
'description': '❤️(4K) Spiderman Girl Yeonhwas Gift \n❤️(4K) 스파이더맨걸 연화의 선물',
'uploader': 'Npickyeonhwa',
'uploader_id': '90574422',
'uploader_url': 'https://www.patreon.com/Yeonhwa726',
'channel_id': '10237902',
'channel_url': 'https://www.patreon.com/Yeonhwa726',
'duration': 70,
'timestamp': 1705150153,
'upload_date': '20240113',
'comment_count': int,
'like_count': int,
'thumbnail': r're:^https?://.+',
},
'params': {'skip_download': 'm3u8'},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -268,16 +288,19 @@ class PatreonIE(PatreonBaseIE):
}) })
# handle Vimeo embeds # handle Vimeo embeds
if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo':
embed_html = try_get(attributes, lambda x: x['embed']['html']) v_url = urllib.parse.unquote(self._html_search_regex(
v_url = url_or_none(compat_urllib_parse_unquote( r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)',
self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '')
if v_url: if url_or_none(v_url) and self._request_webpage(
v_url = VimeoIE._smuggle_referrer(v_url, 'https://patreon.com') v_url, video_id, 'Checking Vimeo embed URL',
if self._request_webpage(v_url, video_id, 'Checking Vimeo embed URL', fatal=False, errnote=False): headers={'Referer': 'https://patreon.com/'},
return self.url_result(v_url, VimeoIE, url_transparent=True, **info) fatal=False, errnote=False):
return self.url_result(
VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'),
VimeoIE, url_transparent=True, **info)
embed_url = try_get(attributes, lambda x: x['embed']['url']) embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none}))
if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False): if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False):
return self.url_result(embed_url, **info) return self.url_result(embed_url, **info)

Wyświetl plik

@ -174,7 +174,7 @@ class TheaterComplexTownBaseIE(StacommuBaseIE):
class TheaterComplexTownVODIE(TheaterComplexTownBaseIE): class TheaterComplexTownVODIE(TheaterComplexTownBaseIE):
_VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:en/)?videos/episodes/(?P<id>\w+)' _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:(?:en|ja)/)?videos/episodes/(?P<id>\w+)'
IE_NAME = 'theatercomplextown:vod' IE_NAME = 'theatercomplextown:vod'
_TESTS = [{ _TESTS = [{
'url': 'https://www.theater-complex.town/videos/episodes/hoxqidYNoAn7bP92DN6p78', 'url': 'https://www.theater-complex.town/videos/episodes/hoxqidYNoAn7bP92DN6p78',
@ -195,6 +195,9 @@ class TheaterComplexTownVODIE(TheaterComplexTownBaseIE):
}, { }, {
'url': 'https://www.theater-complex.town/en/videos/episodes/6QT7XYwM9dJz5Gf9VB6K5y', 'url': 'https://www.theater-complex.town/en/videos/episodes/6QT7XYwM9dJz5Gf9VB6K5y',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.theater-complex.town/ja/videos/episodes/hoxqidYNoAn7bP92DN6p78',
'only_matching': True,
}] }]
_API_PATH = 'videoEpisodes' _API_PATH = 'videoEpisodes'
@ -204,7 +207,7 @@ class TheaterComplexTownVODIE(TheaterComplexTownBaseIE):
class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE): class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE):
_VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:en/)?ppv/(?P<id>\w+)' _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:(?:en|ja)/)?ppv/(?P<id>\w+)'
IE_NAME = 'theatercomplextown:ppv' IE_NAME = 'theatercomplextown:ppv'
_TESTS = [{ _TESTS = [{
'url': 'https://www.theater-complex.town/ppv/wytW3X7khrjJBUpKuV3jen', 'url': 'https://www.theater-complex.town/ppv/wytW3X7khrjJBUpKuV3jen',
@ -223,6 +226,9 @@ class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE):
}, { }, {
'url': 'https://www.theater-complex.town/en/ppv/wytW3X7khrjJBUpKuV3jen', 'url': 'https://www.theater-complex.town/en/ppv/wytW3X7khrjJBUpKuV3jen',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://www.theater-complex.town/ja/ppv/qwUVmLmGEiZ3ZW6it9uGys',
'only_matching': True,
}] }]
_API_PATH = 'events' _API_PATH = 'events'

Wyświetl plik

@ -41,7 +41,7 @@ class STVPlayerIE(InfoExtractor):
ptype, video_id = self._match_valid_url(url).groups() ptype, video_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, video_id, fatal=False) or '' webpage = self._download_webpage(url, video_id, fatal=False) or ''
props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {} props = self._search_nextjs_data(webpage, video_id, default={}).get('props') or {}
player_api_cache = try_get( player_api_cache = try_get(
props, lambda x: x['initialReduxState']['playerApiCache']) or {} props, lambda x: x['initialReduxState']['playerApiCache']) or {}

Wyświetl plik

@ -11,7 +11,6 @@ from ..compat import compat_urllib_parse_urlparse
from ..networking import HEADRequest from ..networking import HEADRequest
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
LazyList,
UnsupportedError, UnsupportedError,
UserNotLive, UserNotLive,
determine_ext, determine_ext,
@ -776,7 +775,7 @@ class TikTokIE(TikTokBaseIE):
status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0
video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict}))
elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'): elif next_data := self._search_nextjs_data(webpage, video_id, default={}):
self.write_debug('Found next.js data') self.write_debug('Found next.js data')
status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0
video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict}))
@ -793,102 +792,150 @@ class TikTokIE(TikTokBaseIE):
class TikTokUserIE(TikTokBaseIE): class TikTokUserIE(TikTokBaseIE):
IE_NAME = 'tiktok:user' IE_NAME = 'tiktok:user'
_VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\.-]+)/?(?:$|[#?])' _VALID_URL = [
_WORKING = False r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\.-]+)/?(?:$|[#?])',
r'tiktokuser:(?P<id>MS4wLjABAAAA[\w-]{64})',
]
_TESTS = [{ _TESTS = [{
'url': 'https://tiktok.com/@corgibobaa?lang=en', 'url': 'https://tiktok.com/@corgibobaa?lang=en',
'playlist_mincount': 45, 'playlist_mincount': 45,
'info_dict': { 'info_dict': {
'id': '6935371178089399301', 'id': 'MS4wLjABAAAAepiJKgwWhulvCpSuUVsp7sgVVsFJbbNaLeQ6OQ0oAJERGDUIXhb2yxxHZedsItgT',
'title': 'corgibobaa', 'title': 'corgibobaa',
'thumbnail': r're:https://.+_1080x1080\.webp'
}, },
'expected_warnings': ['Retrying']
}, { }, {
'url': 'https://www.tiktok.com/@6820838815978423302', 'url': 'https://www.tiktok.com/@6820838815978423302',
'playlist_mincount': 5, 'playlist_mincount': 5,
'info_dict': { 'info_dict': {
'id': '6820838815978423302', 'id': 'MS4wLjABAAAA0tF1nBwQVVMyrGu3CqttkNgM68Do1OXUFuCY0CRQk8fEtSVDj89HqoqvbSTmUP2W',
'title': '6820838815978423302', 'title': '6820838815978423302',
'thumbnail': r're:https://.+_1080x1080\.webp'
}, },
'expected_warnings': ['Retrying']
}, { }, {
'url': 'https://www.tiktok.com/@meme', 'url': 'https://www.tiktok.com/@meme',
'playlist_mincount': 593, 'playlist_mincount': 593,
'info_dict': { 'info_dict': {
'id': '79005827461758976', 'id': 'MS4wLjABAAAAiKfaDWeCsT3IHwY77zqWGtVRIy9v4ws1HbVi7auP1Vx7dJysU_hc5yRiGywojRD6',
'title': 'meme', 'title': 'meme',
'thumbnail': r're:https://.+_1080x1080\.webp'
}, },
'expected_warnings': ['Retrying'] }, {
'url': 'tiktokuser:MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
'playlist_mincount': 31,
'info_dict': {
'id': 'MS4wLjABAAAAM3R2BtjzVT-uAtstkl2iugMzC6AtnpkojJbjiOdDDrdsTiTR75-8lyWJCY5VvDrZ',
},
}] }]
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0'
_API_BASE_URL = 'https://www.tiktok.com/api/creator/item_list/'
r''' # TODO: Fix by adding _signature to api_url def _build_web_query(self, sec_uid, cursor):
def _entries(self, webpage, user_id, username): return {
secuid = self._search_regex(r'\"secUid\":\"(?P<secUid>[^\"]+)', webpage, username) 'aid': '1988',
verifyfp_cookie = self._get_cookies('https://www.tiktok.com').get('s_v_web_id') 'app_language': 'en',
if not verifyfp_cookie: 'app_name': 'tiktok_web',
raise ExtractorError('Improper cookies (missing s_v_web_id).', expected=True) 'browser_language': 'en-US',
api_url = f'https://m.tiktok.com/api/post/item_list/?aid=1988&cookie_enabled=true&count=30&verifyFp={verifyfp_cookie.value}&secUid={secuid}&cursor=' 'browser_name': 'Mozilla',
cursor = '0' 'browser_online': 'true',
for page in itertools.count(): 'browser_platform': 'Win32',
data_json = self._download_json(api_url + cursor, username, note='Downloading Page %d' % page) 'browser_version': '5.0 (Windows)',
for video in data_json.get('itemList', []): 'channel': 'tiktok_web',
video_id = video['id'] 'cookie_enabled': 'true',
video_url = f'https://www.tiktok.com/@{user_id}/video/{video_id}' 'count': '15',
yield self._url_result(video_url, 'TikTok', video_id, str_or_none(video.get('desc'))) 'cursor': cursor,
if not data_json.get('hasMore'): 'device_id': ''.join(random.choices(string.digits, k=19)),
break 'device_platform': 'web_pc',
cursor = data_json['cursor'] 'focus_state': 'true',
''' 'from_page': 'user',
'history_len': '2',
def _video_entries_api(self, webpage, user_id, username): 'is_fullscreen': 'false',
query = { 'is_page_visible': 'true',
'user_id': user_id, 'language': 'en',
'count': 21, 'os': 'windows',
'max_cursor': 0, 'priority_region': '',
'min_cursor': 0, 'referer': '',
'retry_type': 'no_retry', 'region': 'US',
'device_id': ''.join(random.choices(string.digits, k=19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. 'screen_height': '1080',
'screen_width': '1920',
'secUid': sec_uid,
'type': '1', # pagination type: 0 == oldest-to-newest, 1 == newest-to-oldest
'tz_name': 'UTC',
'verifyFp': 'verify_%s' % ''.join(random.choices(string.hexdigits, k=7)),
'webcast_language': 'en',
} }
def _entries(self, sec_uid, user_name):
cursor = int(time.time() * 1E3)
for page in itertools.count(1): for page in itertools.count(1):
for retry in self.RetryManager(): response = self._download_json(
try: self._API_BASE_URL, user_name or sec_uid, f'Downloading page {page}',
post_list = self._call_api( query=self._build_web_query(sec_uid, cursor), headers={'User-Agent': self._USER_AGENT})
'aweme/post', query, username, note=f'Downloading user video list page {page}',
errnote='Unable to download user video list')
except ExtractorError as e:
if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0:
retry.error = e
continue
raise
yield from post_list.get('aweme_list', [])
if not post_list.get('has_more'):
break
query['max_cursor'] = post_list['max_cursor']
def _entries_api(self, user_id, videos): for video in traverse_obj(response, ('itemList', lambda _, v: v['id'])):
for video in videos: video_id = video['id']
yield { webpage_url = self._create_url(user_name, video_id)
**self._parse_aweme_video_app(video), info = try_call(
'extractor_key': TikTokIE.ie_key(), lambda: self._parse_aweme_video_web(video, webpage_url, video_id)) or {'id': video_id}
'extractor': 'TikTok', info.pop('formats', None)
'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}', yield self.url_result(webpage_url, TikTokIE, **info)
}
old_cursor = cursor
cursor = traverse_obj(
response, ('itemList', -1, 'createTime', {lambda x: x * 1E3}, {int_or_none}))
if not cursor:
cursor = old_cursor - 604800000 # jump 1 week back in time
if cursor < 1472706000000 or not traverse_obj(response, 'hasMorePrevious'):
break
def _get_sec_uid(self, user_url, user_name, msg):
webpage = self._download_webpage(
user_url, user_name, fatal=False, headers={'User-Agent': 'Mozilla/5.0'},
note=f'Downloading {msg} webpage', errnote=f'Unable to download {msg} webpage') or ''
return traverse_obj(
self._get_universal_data(webpage, user_name),
('webapp.user-detail', 'userInfo', 'user', 'secUid', {str})) or traverse_obj(
self._get_sigi_state(webpage, user_name),
('LiveRoom', 'liveRoomUserInfo', 'user', 'secUid'),
('UserModule', 'users', ..., 'secUid'),
get_all=False, expected_type=str)
def _real_extract(self, url): def _real_extract(self, url):
user_name = self._match_id(url) user_name, sec_uid = None, None
webpage = self._download_webpage(url, user_name, headers={ if url.startswith('tiktokuser:'):
'User-Agent': 'facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)' sec_uid = self._match_id(url)
}) else:
user_id = self._html_search_regex(r'snssdk\d*://user/profile/(\d+)', webpage, 'user ID', default=None) or user_name user_name = self._match_id(url)
videos = LazyList(self._video_entries_api(webpage, user_id, user_name)) if not sec_uid:
thumbnail = traverse_obj(videos, (0, 'author', 'avatar_larger', 'url_list', 0)) for user_url, msg in (
(self._UPLOADER_URL_FORMAT % user_name, 'user'),
(self._UPLOADER_URL_FORMAT % f'{user_name}/live', 'live'),
):
sec_uid = self._get_sec_uid(user_url, user_name, msg)
if sec_uid:
break
return self.playlist_result(self._entries_api(user_id, videos), user_id, user_name, thumbnail=thumbnail) if not sec_uid:
webpage = self._download_webpage(
f'https://www.tiktok.com/embed/@{user_name}', user_name,
note='Downloading user embed page', fatal=False) or ''
data = traverse_obj(self._search_json(
r'<script[^>]+\bid=[\'"]__FRONTITY_CONNECT_STATE__[\'"][^>]*>',
webpage, 'data', user_name, default={}),
('source', 'data', f'/embed/@{user_name}', {dict}))
for aweme_id in traverse_obj(data, ('videoList', ..., 'id')):
try:
sec_uid = self._extract_aweme_app(aweme_id).get('channel_id')
except ExtractorError:
continue
if sec_uid:
break
if not sec_uid:
raise ExtractorError(
'Unable to extract secondary user ID. Try using "tiktokuser:CHANNEL_ID" as the '
'input URL, replacing "CHANNEL_ID" with the channel_id of the requested user')
return self.playlist_result(self._entries(sec_uid, user_name), sec_uid, user_name)
class TikTokBaseListIE(TikTokBaseIE): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor class TikTokBaseListIE(TikTokBaseIE): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor

Wyświetl plik

@ -147,7 +147,7 @@ class WrestleUniverseBaseIE(InfoExtractor):
metadata = self._call_api(video_id, msg='metadata', query={'al': lang or 'ja'}, auth=False, fatal=False) metadata = self._call_api(video_id, msg='metadata', query={'al': lang or 'ja'}, auth=False, fatal=False)
if not metadata: if not metadata:
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
nextjs_data = self._search_nextjs_data(webpage, video_id) nextjs_data = self._search_nextjs_data(webpage, video_id, fatal=False)
metadata = traverse_obj(nextjs_data, ( metadata = traverse_obj(nextjs_data, (
'props', 'pageProps', *variadic(props_keys, (str, bytes, dict, set)), {dict})) or {} 'props', 'pageProps', *variadic(props_keys, (str, bytes, dict, set)), {dict})) or {}
return metadata return metadata