From 401e28b318c3c38d5a9022f356972d142d585f84 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Tue, 16 Apr 2024 16:35:51 -0700 Subject: [PATCH 1/8] BBC Issue 9701: NEXT_DATA field video extraction for bbc Some bbc articles with embedded video have the data for them within a json structure tagged with NEXT_DATA. Add a parser for this case. Links tested: https://www.bbc.com/news/uk-68546268 https://www.bbc.com/news/world-middle-east-68778149 https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness --- yt_dlp/extractor/bbc.py | 52 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 015af9e1d..a0108fec5 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade', + 'title': 'Russia stages massive WW2 parade despite Western boycott', 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, @@ -791,6 +791,17 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'timestamp': 1638230731, 'upload_date': '20211130', }, + }, { + # video with script id __NEXT_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/uk-68546268', + 'info_dict': { + 'id': 'p0hj0lq7', + 'ext': 'mp4', + 'title': 'Nasser Hospital doctor describes his treatment by IDF', + 'description': 'Doctor Abu Sabha said he was detained by Israeli forces after the raid on Nasser Hospital and feared for his life.\n\nThe IDF said "during the activity, about 200 terrorists and suspects of terrorist activity were detained, including some who posed as medical teams, many weapons were found, as well as closed medicines intended for Israeli hostages."', + 'thumbnail': r're:https?://.+/.+\.jpg', + 
'timestamp': 1710270205000, + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -1255,6 +1266,45 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE lambda s: self._parse_json(s, playlist_id, fatal=False), re.findall(pattern, webpage)))) + # US accessed article with single embedded video (e.g. + # https://www.bbc.com/news/uk-68546268) + video_id = self._match_id(url) + next_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['page'] + video_data = None + timestamp = None + for key in next_data: + for item in (try_get(next_data, lambda x: x[key]['contents'], list) or []): + if item.get('type') == 'video': + video_data = item + elif item.get('type') == 'timestamp': + timestamp = item + if video_data: + for item in (try_get(video_data, lambda x: x['model']['blocks'], list) or []): + if item.get('type') == 'media': + for subtype in (try_get(item, lambda x: x['model']['blocks'], list) or []): + if subtype.get('type') == 'mediaMetadata': + model = subtype.get('model') + if model: + item_id = try_get(model, lambda x: x['versions'][0]['versionId']) + item_thumbnail = model.get('imageUrl') + item_title = model.get('title') + formats, subtitles = self._download_media_selector(item_id) + synopses = model.get('synopses') or {} + item_time = None + if timestamp: + item_time = try_get(timestamp, lambda x: x['model']['timestamp']) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item_thumbnail, + 'formats': formats, + 'subtitles': subtitles, + 'timestamp': item_time, + 'description': dict_get(synopses, ('long', 'medium', 'short')) + }) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' 
% self._ID_REGEX From 5f35e175726469477e371996d050bfe7b2c68798 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Thu, 18 Apr 2024 09:12:36 -0700 Subject: [PATCH 2/8] Using traverse_obj --- yt_dlp/extractor/bbc.py | 57 +++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index a0108fec5..1bc7a69b7 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1268,40 +1268,31 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE # US accessed article with single embedded video (e.g. # https://www.bbc.com/news/uk-68546268) - video_id = self._match_id(url) - next_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['page'] - video_data = None - timestamp = None - for key in next_data: - for item in (try_get(next_data, lambda x: x[key]['contents'], list) or []): - if item.get('type') == 'video': - video_data = item - elif item.get('type') == 'timestamp': - timestamp = item + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id), ( + 'props', 'pageProps', 'page'), get_all=False) + video_data = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'video'), get_all=False) if video_data: - for item in (try_get(video_data, lambda x: x['model']['blocks'], list) or []): - if item.get('type') == 'media': - for subtype in (try_get(item, lambda x: x['model']['blocks'], list) or []): - if subtype.get('type') == 'mediaMetadata': - model = subtype.get('model') - if model: - item_id = try_get(model, lambda x: x['versions'][0]['versionId']) - item_thumbnail = model.get('imageUrl') - item_title = model.get('title') - formats, subtitles = self._download_media_selector(item_id) - synopses = model.get('synopses') or {} - item_time = None - if timestamp: - item_time = try_get(timestamp, lambda x: x['model']['timestamp']) - entries.append({ - 'id': item_id, - 'title': item_title, - 'thumbnail': item_thumbnail, - 
'formats': formats, - 'subtitles': subtitles, - 'timestamp': item_time, - 'description': dict_get(synopses, ('long', 'medium', 'short')) - }) + timestamp = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'timestamp', + 'model', 'timestamp', {int_or_none}), get_all=False) + model = traverse_obj(video_data, ( + 'model', 'blocks', lambda _, v: v['type'] == 'media', + 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', + 'model'), get_all=False) + if model: + item_id = try_get(model, lambda x: x['versions'][0]['versionId']) + formats, subtitles = self._download_media_selector(item_id) + synopses = model.get('synopses') or {} + entries.append({ + 'id': item_id, + 'title': model.get('title'), + 'thumbnail': model.get('imageUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'timestamp': timestamp, + 'description': dict_get(synopses, ('long', 'medium', 'short')) + }) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) From 89eaee2ff83de8bcd44472d39e89110fec8acf08 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Fri, 19 Apr 2024 10:04:05 -0700 Subject: [PATCH 3/8] one more traverse --- yt_dlp/extractor/bbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 1bc7a69b7..be36bbb63 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1281,7 +1281,8 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', 'model'), get_all=False) if model: - item_id = try_get(model, lambda x: x['versions'][0]['versionId']) + item_id = traverse_obj(model, ( + 'versions', 0, 'versionId'), get_all=False) formats, subtitles = self._download_media_selector(item_id) synopses = model.get('synopses') or {} entries.append({ From b9af6bf2ce8d5d8a5841dc84f0ed63d762c50e36 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Fri, 19 Apr 2024 10:06:59 -0700 Subject: 
[PATCH 4/8] nit, style --- yt_dlp/extractor/bbc.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index be36bbb63..9419f1ce1 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1281,8 +1281,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', 'model'), get_all=False) if model: - item_id = traverse_obj(model, ( - 'versions', 0, 'versionId'), get_all=False) + item_id = traverse_obj(model, ('versions', 0, 'versionId')) formats, subtitles = self._download_media_selector(item_id) synopses = model.get('synopses') or {} entries.append({ From 9dbd9fc8734c76e0561bfd3de4038a0fef521491 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Fri, 19 Apr 2024 10:50:22 -0700 Subject: [PATCH 5/8] more streamlining --- yt_dlp/extractor/bbc.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 9419f1ce1..f882c56b2 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1273,25 +1273,27 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE video_data = traverse_obj(next_data, ( ..., 'contents', lambda _, v: v['type'] == 'video'), get_all=False) if video_data: - timestamp = traverse_obj(next_data, ( - ..., 'contents', lambda _, v: v['type'] == 'timestamp', - 'model', 'timestamp', {int_or_none}), get_all=False) model = traverse_obj(video_data, ( 'model', 'blocks', lambda _, v: v['type'] == 'media', 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', 'model'), get_all=False) if model: - item_id = traverse_obj(model, ('versions', 0, 'versionId')) + timestamp = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'timestamp', + 'model', 'timestamp', {int_or_none}, any)) + item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) formats, subtitles = 
self._download_media_selector(item_id) - synopses = model.get('synopses') or {} entries.append({ 'id': item_id, - 'title': model.get('title'), - 'thumbnail': model.get('imageUrl'), 'formats': formats, 'subtitles': subtitles, 'timestamp': timestamp, - 'description': dict_get(synopses, ('long', 'medium', 'short')) + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {url_or_none}), + 'description': ( + 'synopses', ('long', 'medium', 'short'), {str}, any), + }) }) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) From e2ae76e84cba1e603f792ea7a0db9903c7bfce57 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Sun, 21 Apr 2024 16:22:46 -0700 Subject: [PATCH 6/8] Making the parse_model function, address comments --- yt_dlp/extractor/bbc.py | 74 ++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index f882c56b2..46fa7b7de 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -798,9 +798,11 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'id': 'p0hj0lq7', 'ext': 'mp4', 'title': 'Nasser Hospital doctor describes his treatment by IDF', - 'description': 'Doctor Abu Sabha said he was detained by Israeli forces after the raid on Nasser Hospital and feared for his life.\n\nThe IDF said "during the activity, about 200 terrorists and suspects of terrorist activity were detained, including some who posed as medical teams, many weapons were found, as well as closed medicines intended for Israeli hostages."', + 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276,} hostages\."$', 'thumbnail': r're:https?://.+/.+\.jpg', - 'timestamp': 1710270205000, + 'timestamp': 1710188248, + 'upload_date': '20240311', + 'duration': 104, }, }, { # single video article embedded with data-media-vpid @@ -1266,37 +1268,47 @@ class BBCIE(BBCCoUkIE): # XXX: Do 
not subclass from concrete IE lambda s: self._parse_json(s, playlist_id, fatal=False), re.findall(pattern, webpage)))) + def parse_model(model): + '''Extract single video from model structure''' + if(type(model) == list): + model = model[0] + item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) + if not item_id: + return + formats, subtitles = self._download_media_selector(item_id) + return { + 'id': item_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ( + 'synopses', ('long', 'medium', 'short'), {str}, any), + 'duration': ('versions', 0, 'duration', {int}), + 'timestamp': ('versions', 0, 'availableFrom', {lambda x: int_or_none(x, scale=1000)}), + }) + } + # US accessed article with single embedded video (e.g. # https://www.bbc.com/news/uk-68546268) - next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id), ( - 'props', 'pageProps', 'page'), get_all=False) - video_data = traverse_obj(next_data, ( - ..., 'contents', lambda _, v: v['type'] == 'video'), get_all=False) - if video_data: - model = traverse_obj(video_data, ( - 'model', 'blocks', lambda _, v: v['type'] == 'media', - 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', - 'model'), get_all=False) - if model: - timestamp = traverse_obj(next_data, ( - ..., 'contents', lambda _, v: v['type'] == 'timestamp', - 'model', 'timestamp', {int_or_none}, any)) - item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) - formats, subtitles = self._download_media_selector(item_id) - entries.append({ - 'id': item_id, - 'formats': formats, - 'subtitles': subtitles, - 'timestamp': timestamp, - **traverse_obj(model, { - 'title': ('title', {str}), - 'thumbnail': ('imageUrl', {url_or_none}), - 'description': ( - 'synopses', ('long', 'medium', 'short'), {str}, any), - }) - }) - return self.playlist_result( - entries, 
playlist_id, playlist_title, playlist_description) + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), ( + 'props', 'pageProps', 'page')) + model = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'video', + 'model', 'blocks', lambda _, v: v['type'] == 'media', + 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', + 'model')) + if model: + entry = parse_model(model) + if entry: + if entry.get('timestamp') is None: + entry['timestamp'] = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'timestamp', + 'model', 'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) + entries.append(entry) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) From 4b9a54b464bd9ddc57170f27a8ffd4ac6a987bd3 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Sun, 21 Apr 2024 16:41:19 -0700 Subject: [PATCH 7/8] flake8 check --- yt_dlp/extractor/bbc.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 46fa7b7de..50ccf922f 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -1270,7 +1270,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE def parse_model(model): '''Extract single video from model structure''' - if(type(model) == list): + if isinstance(model, list): model = model[0] item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) if not item_id: @@ -1285,10 +1285,10 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), 'description': ( 'synopses', ('long', 'medium', 'short'), {str}, any), - 'duration': ('versions', 0, 'duration', {int}), - 'timestamp': ('versions', 0, 'availableFrom', {lambda x: int_or_none(x, 
scale=1000)}), - }) - } + 'duration': ('versions', 0, 'duration', {int}), + 'timestamp': ('versions', 0, 'availableFrom', {lambda x: int_or_none(x, scale=1000)}), + }) + } # US accessed article with single embedded video (e.g. # https://www.bbc.com/news/uk-68546268) From 6ef899032037351f10a92145f931962c058e2300 Mon Sep 17 00:00:00 2001 From: Kyle Gonsalves Date: Sun, 21 Apr 2024 18:17:01 -0700 Subject: [PATCH 8/8] different solution for traversal issues --- yt_dlp/extractor/bbc.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 50ccf922f..bf867394b 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -798,7 +798,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'id': 'p0hj0lq7', 'ext': 'mp4', 'title': 'Nasser Hospital doctor describes his treatment by IDF', - 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276,} hostages\."$', + 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$', 'thumbnail': r're:https?://.+/.+\.jpg', 'timestamp': 1710188248, 'upload_date': '20240311', @@ -1270,8 +1270,6 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE def parse_model(model): '''Extract single video from model structure''' - if isinstance(model, list): - model = model[0] item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) if not item_id: return @@ -1298,7 +1296,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE ..., 'contents', lambda _, v: v['type'] == 'video', 'model', 'blocks', lambda _, v: v['type'] == 'media', 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', - 'model')) + 'model', {dict}, any)) if model: entry = parse_model(model) if entry: