From da07bbeb12850e3c40914c3f5359eca675ea8d11 Mon Sep 17 00:00:00 2001
From: Tahasanul Abraham
Date: Tue, 14 Nov 2023 18:32:24 +0100
Subject: [PATCH] [extractor:update] Support profiles, searches, channels, quickies and favourites

---
 yt_dlp/extractor/_extractors.py |   5 +-
 yt_dlp/extractor/xvideos.py     | 264 +++++++++++++++++++++++++++++---
 2 files changed, 250 insertions(+), 19 deletions(-)

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 8b036bb69..2f918f049 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -2478,7 +2478,10 @@ from .xtube import XTubeUserIE, XTubeIE
 from .xuite import XuiteIE
 from .xvideos import (
     XVideosIE,
-    XVideosQuickiesIE
+    XVideosChannelIE,
+    XVideosPlaylistIE,
+    XVideosRelatedIE,
+    XVideosSearchIE,
 )
 from .xxxymovies import XXXYMoviesIE
 from .yahoo import (
diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py
index 5df071503..1f824fc72 100644
--- a/yt_dlp/extractor/xvideos.py
+++ b/yt_dlp/extractor/xvideos.py
@@ -1,13 +1,24 @@
 import re
+import itertools
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+    compat_urlparse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_urlencode,
+)
 from ..utils import (
     clean_html,
     determine_ext,
+    extract_attributes,
     ExtractorError,
     int_or_none,
     parse_duration,
+    try_get,
+    url_basename,
+    urljoin,
 )
 
 
@@ -159,22 +170,239 @@ class XVideosIE(InfoExtractor):
         }
 
 
-class XVideosQuickiesIE(InfoExtractor):
-    IE_NAME = 'xvideos:quickies'
-    _VALID_URL = r'https?://(?P<domain>(?:[^/]+\.)?xvideos2?\.com)/amateur-channels/[^#]+#quickies/a/(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'https://www.xvideos.com/amateur-channels/wifeluna#quickies/a/47258683',
-        'md5': '16e322a93282667f1963915568f782c1',
-        'info_dict': {
-            'id': '47258683',
-            'ext': 'mp4',
-            'title': 'Verification video',
-            'age_limit': 18,
-            'duration': 16,
-            'thumbnail': r're:^https://cdn.*-pic.xvideos-cdn.com/.+\.jpg',
-        }
-    }]
+class XVideosPlaylistIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?xvideos2?\.com/
+                        (?:c(?:/[sm]:[^/]+)*|
+                           profiles|
+                           favorite)/
+                        (?P<id>[^#?/]+)
+                '''
+    _TESTS = []
+
+    def _extract_videos_from_json_list(self, json_list, path='video'):
+        return (
+            'https://www.xvideos.com/%s%d/%s' % (path, x.get('id'), str(x.get('u')).split('/')[-1])
+            for x in json_list if isinstance(x, dict))
+
+    def _get_playlist_url(self, url, playlist_id):
+        """URL of first playlist page"""
+        id_match = re.match(self._VALID_URL, url).groupdict()
+        video_sort = id_match.get('sort')
+        if video_sort:
+            url, _ = compat_urlparse.urldefrag(url)
+            if url.endswith('/'):
+                url = url[:-1]
+            url = '%s/%s' % (url, video_sort.replace('-', '/'))
+        return url
+
+    def _get_next_page(self, url, num, page):
+        """URL of num'th continuation page of url"""
+        if page.startswith('{'):
+            url, sub = re.subn(r'(/)(\d{1,7})($|[#?/])', r'\g<1>%d\3' % (num, ), url)
+            if sub == 0:
+                url += '/%d' % (num, )
+            return url
+        next_page = self._search_regex(
+            r'''(?s)(<a\s[^>]*?\bclass\s*=\s*(?P<q>'|").*?\bnext-page\b.*?(?P=q)[^>]*?>)''',
+            page, 'next page', default=None)
+        if next_page:
+            next_page = extract_attributes(next_page)
+            next_page = next_page.get('href')
+            if next_page:
+                return urljoin(url, next_page)
+        return False
+
+    def _extract_videos(self, url, playlist_id, num, page):
+        """Get iterable videos plus stop flag"""
+        return ((
+            'https://www.xvideos.com/video' + x.group('video_id')
+            for x in re.finditer(
+                r'''class\s*=\s*"title"\s*>\s*<\s*a\s*href\s*=\s*('|")/video(?P<video_id>(.*?))\1''',
+                page)),
+            None)
 
     def _real_extract(self, url):
-        domain, id_ = self._match_valid_url(url).group('domain', 'id')
-        return self.url_result(f'https://{domain}/video{id_}/_', XVideosIE, id_)
+        id_match = re.match(self._VALID_URL, url).groupdict()
+        playlist_id = id_match['id']
+        if 'video' in playlist_id and url.endswith(playlist_id):
+            url += '/0'
+
+        next_page = self._get_playlist_url(url, playlist_id)
+
+        if id_match.get('quickiesid'):
+            return self.url_result(next_page, XVideosIE)
+
+        matches = []
+        for count in itertools.count(0):
+            webpage = self._download_webpage(
+                next_page,
+                '%s (+%d)' % (playlist_id, count) if count > 0 else playlist_id)
+
+            vids, stop = self._extract_videos(next_page, playlist_id, count, webpage)
+
+            if vids:
+                matches.append(vids)
+
+            if stop:
+                break
+            next_page = self._get_next_page(next_page, count + 1, webpage)
+            if not next_page:
+                break
+
+        return self.playlist_from_matches(
+            itertools.chain.from_iterable(matches), playlist_id)
+
+
+class XVideosRelatedIE(XVideosPlaylistIE):
+    _VALID_URL = XVideosIE._VALID_URL + r'(?:/[^/]+)*?\#_related-(?P<related>videos|playlists)'
+
+    _TESTS = []
+
+    def _extract_videos(self, url, playlist_id, num, page):
+        id_match = re.match(self._VALID_URL, url).groupdict()
+        related = id_match.get('related')
+        if not related:
+            return super(XVideosRelatedIE, self)._extract_videos(url, playlist_id, num, page)
+
+        if related == 'videos':
+            related_json = self._search_regex(
+                r'(?s)videos_related\s*=\s*(\[.*?])\s*;',
+                page, 'related', default='[]')
+            related_json = self._parse_json(related_json, playlist_id, fatal=False) or []
+            return (self._extract_videos_from_json_list(related_json), True)
+        # playlists
+        related_json = self._download_json(
+            'https://www.xvideos.com/video-playlists/' + playlist_id, playlist_id, fatal=False)
+        return (
+            self._extract_videos_from_json_list(
+                try_get(related_json, lambda x: x['playlists'], list) or [],
+                path='favorite/'),
+            True)
+
+
+class XVideosChannelIE(XVideosPlaylistIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?xvideos2?\.com/
+                        (?:
+                            (?:amateur-|pornstar-|model-)?channel|profile|
+                            pornstar|model|amateur
+                        )s/
+                        (?P<id>[^#?/]+)
+                        (?:\#_tab(?P<tab>Videos|Favorites|Playlists|AboutMe)(?:,(?P<sort>[^,]+))?)?
+                        (?:\#quickies/a/(?P<quickiesid>.*))?
+                '''
+    _TESTS = [{
+        'url': 'https://www.xvideos.com/pornstar-channels/sienna-west',
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www.xvideos.com/pornstars/silvia-jons#_tabVideos',
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www.xvideos.com/channels/miss_floyd#_tabVideos',
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www.xvideos.com/models/migurt-1',
+        'playlist_mincount': 5,
+    }, {
+        'url': 'https://www.xvideos.com/amateur-channels/wifeluna#quickies/a/47258683',
+        'playlist_mincount': 5,
+    }, ]
+
+    def _get_playlist_url(self, url, playlist_id):
+        webpage = self._download_webpage(url, playlist_id)
+        id_match = re.match(self._VALID_URL, url).groupdict()
+        tab = (id_match.get('tab') or '').lower()
+        quickiesid = (id_match.get('quickiesid') or '').lower()
+
+        if not tab and not quickiesid:
+            url += '#_tabVideos'
+
+        if tab and not quickiesid:
+            if tab in ('videos', 'favorites'):
+                url, frag = compat_urlparse.urldefrag(url)
+                if not url.endswith('/'):
+                    url += '/'
+                frag = frag.split(',')
+                url += tab
+                if tab == 'videos':
+                    url += '/' + (frag[1] if len(frag) > 1 else 'best')
+                url += '/0'
+            return url
+
+        elif quickiesid:
+            url = f'https://www.xvideos.com/video{quickiesid}/_'
+            return url
+
+        # activity
+        conf = self._search_regex(
+            r'(?s)\.\s*xv\s*\.\s*conf\s*=\s*(\{.*?})[\s;]*</script',
+            webpage, 'xv conf', default='{}')
+        conf = self._parse_json(conf, playlist_id, fatal=False) or {}
+        return url
+
+    def _extract_videos(self, url, playlist_id, num, page):
+        id_match = re.match(self._VALID_URL, url).groupdict()
+        tab = (id_match.get('tab') or '').lower()
+
+        if tab == 'favorites':
+            return ((
+                'https://www.xvideos.com' + x.group('playlist')
+                for x in re.finditer(r'''<a\s[^>]*?href\s*=\s*('|")(?P<playlist>/favorite/\d+/[^#?]+?)\1''', page)),
+                None)
+
+        return super(XVideosChannelIE, self)._extract_videos(url, playlist_id, num, page)
+
+
+class XVideosSearchIE(XVideosPlaylistIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:[^/]+\.)?xvideos2?\.com/
+                        \?k=(?P<id>[^#?/&]+)
+                '''
+    _TESTS = [{
+        # uninteresting search with probably at least two pages of results,
+        # but not too many more
+        'url': 'http://www.xvideos.com/?k=libya&sort=length',
+        'playlist_mincount': 30,
+    }, ]
+
+    def _get_next_page(self, url, num, page):
+        parsed_url = compat_urlparse.urlparse(url)
+        qs = compat_parse_qs(parsed_url.query)
+        qs['p'] = [num]
+        parsed_url = (
+            list(parsed_url[:4]) +
+            [compat_urllib_parse_urlencode(qs, True), None])
+        return compat_urlparse.urlunparse(parsed_url)
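
As a quick way to exercise the new extractors locally, here is a minimal sketch, assuming this patch is applied to a yt-dlp checkout; the channel URL comes from the XVideosChannelIE _TESTS above, and the script name and option values are illustrative only, not part of the patch.

# smoke_test_xvideos_playlists.py (hypothetical name; assumes the patch above is applied)
import yt_dlp

opts = {
    'quiet': True,
    'extract_flat': 'in_playlist',  # list playlist entries without resolving each video
    'playlistend': 5,               # keep the check short
}

with yt_dlp.YoutubeDL(opts) as ydl:
    # URL taken from the XVideosChannelIE _TESTS above
    info = ydl.extract_info(
        'https://www.xvideos.com/pornstar-channels/sienna-west', download=False)
    entries = list(info.get('entries') or [])
    print(info.get('extractor_key'), len(entries))

If the extractors are wired up correctly, extractor_key should report XVideosChannel and a handful of flat entries should be listed.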