Merge 6ef8990320 into ff38a011d5

2024-04-22 01:18:10 +00:00 · 2024-04-22 01:18:10 +00:00 · aac430b2fb
commit aac430b2fb
--- a/yt_dlp/extractor/bbc.py
+++ b/yt_dlp/extractor/bbc.py
@ -602,7 +602,7 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
        'url': 'http://www.bbc.com/news/world-europe-32668511',
        'info_dict': {
            'id': 'world-europe-32668511',
-            'title': 'Russia stages massive WW2 parade',
+            'title': 'Russia stages massive WW2 parade despite Western boycott',
            'description': 'md5:00ff61976f6081841f759a08bf78cc9c',
        },
        'playlist_count': 2,
@ -791,6 +791,19 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
            'timestamp': 1638230731,
            'upload_date': '20211130',
        },
+    }, {
+        # video whose metadata is in the <script id="__NEXT_DATA__"> element, stored as a JSON string
+        'url': 'https://www.bbc.com/news/uk-68546268',
+        'info_dict': {
+            'id': 'p0hj0lq7',
+            'ext': 'mp4',
+            'title': 'Nasser Hospital doctor describes his treatment by IDF',
+            'description': r're:(?s)Doctor Abu Sabha said he was detained by Israeli forces after .{276} hostages\."$',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1710188248,
+            'upload_date': '20240311',
+            'duration': 104,
+        },
    }, {
        # single video article embedded with data-media-vpid
        'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
@ -1255,6 +1268,46 @@ class BBCIE(BBCCoUkIE):  # XXX: Do not subclass from concrete IE
                lambda s: self._parse_json(s, playlist_id, fatal=False),
                re.findall(pattern, webpage))))

+        def parse_model(model):
+            """Return an info dict for the video described by `model`, or None if it has no versionId"""
+            item_id = traverse_obj(model, ('versions', 0, 'versionId', {str}))
+            if not item_id:
+                return None  # without a version id there is nothing to pass to the media selector
+            formats, subtitles = self._download_media_selector(item_id)
+            return {
+                'id': item_id,
+                'formats': formats,
+                'subtitles': subtitles,
+                **traverse_obj(model, {
+                    'title': ('title', {str}),
+                    # only rewrite string URLs: swap the '$recipe' placeholder for 'raw', then resolve against `url`
+                    'thumbnail': ('imageUrl', {str}, {lambda u: urljoin(url, u.replace('$recipe', 'raw'))}),
+                    'description': ('synopses', ('long', 'medium', 'short'), {str}, any),
+                    'duration': ('versions', 0, 'duration', {int}),
+                    'timestamp': ('versions', 0, 'availableFrom', {functools.partial(int_or_none, scale=1000)}),
+                }),
+            }
+
+        # Article with a single embedded video, as served when accessed from the US (e.g.
+        # https://www.bbc.com/news/uk-68546268)
+        next_data = traverse_obj(self._search_nextjs_data(webpage, playlist_id, default={}), (
+            'props', 'pageProps', 'page'))
+        model = traverse_obj(next_data, (
+            ..., 'contents', lambda _, v: v['type'] == 'video',
+            'model', 'blocks', lambda _, v: v['type'] == 'media',
+            'model', 'blocks', lambda _, v: v['type'] == 'mediaMetadata',
+            'model', {dict}, any))
+        if model:
+            entry = parse_model(model)
+            if entry:
+                if entry.get('timestamp') is None:
+                    entry['timestamp'] = traverse_obj(next_data, (
+                        ..., 'contents', lambda _, v: v['type'] == 'timestamp',
+                        'model', 'timestamp', {functools.partial(int_or_none, scale=1000)}, any))
+                entries.append(entry)
+                return self.playlist_result(
+                    entries, playlist_id, playlist_title, playlist_description)
+
        # Multiple video article (e.g.
        # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460)
        EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+%s(?:\b[^"]+)?' % self._ID_REGEX