diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 015af9e1d..bf867394b 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade', + 'title': 'Russia stages massive WW2 parade despite Western boycott', 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, @@ -791,6 +791,19 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'timestamp': 1638230731, 'upload_date': '20211130', }, + }, { + # video with script id __NEXT_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/uk-68546268', + 'info_dict': { + 'id': 'p0hj0lq7', + 'ext': 'mp4', + 'title': 'Nasser Hospital doctor describes his treatment by IDF', + 'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1710188248, + 'upload_date': '20240311', + 'duration': 104, + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -1255,6 +1268,46 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE lambda s: self._parse_json(s, playlist_id, fatal=False), re.findall(pattern, webpage)))) + def parse_model(model): + '''Extract single video from model structure''' + item_id = traverse_obj(model, ('versions', 0, 'versionId', {str})) + if not item_id: + return + formats, subtitles = self._download_media_selector(item_id) + return { + 'id': item_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(model, { + 'title': ('title', {str}), + 'thumbnail': ('imageUrl', {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}), + 'description': ( + 'synopses', ('long', 'medium', 'short'), {str}, any), + 'duration': ('versions', 0, 'duration', {int}), + 'timestamp': ('versions', 0, 'availableFrom', {lambda x: int_or_none(x, scale=1000)}), + }) + } + + # US accessed article with single embedded video (e.g. + # https://www.bbc.com/news/uk-68546268) + next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), ( + 'props', 'pageProps', 'page')) + model = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'video', + 'model', 'blocks', lambda _, v: v['type'] == 'media', + 'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata', + 'model', {dict}, any)) + if model: + entry = parse_model(model) + if entry: + if entry.get('timestamp') is None: + entry['timestamp'] = traverse_obj(next_data, ( + ..., 'contents', lambda _, v: v['type'] == 'timestamp', + 'model', 'timestamp', {functools.partial(int_or_none, scale=1000)}, any)) + entries.append(entry) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX