[extractor/substack] Add extractor (#4011)

Closes #3722
Authored by: elyse0
pull/4118/head
Elyse 2022-06-18 19:08:53 -05:00 zatwierdzone przez GitHub
rodzic 7a2e40dd48
commit 612e31f5ea
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
3 zmienionych plików z 135 dodań i 1 usunięć

Wyświetl plik

@ -1640,6 +1640,7 @@ from .streetvoice import StreetVoiceIE
from .stretchinternet import StretchInternetIE
from .stripchat import StripchatIE
from .stv import STVPlayerIE
from .substack import SubstackIE
from .sunporno import SunPornoIE
from .sverigesradio import (
SverigesRadioEpisodeIE,

Wyświetl plik

@ -69,6 +69,7 @@ from .spankwire import SpankwireIE
from .sportbox import SportBoxIE
from .spotify import SpotifyBaseIE
from .springboardplatform import SpringboardPlatformIE
from .substack import SubstackIE
from .svt import SVTIE
from .teachable import TeachableIE
from .ted import TedEmbedIE
@ -2542,7 +2543,34 @@ class GenericIE(InfoExtractor):
'timestamp': 1652833414,
'age_limit': 0,
}
}, {
},
{
'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details',
'md5': '198bde8bed23d0b23c70725c83c9b6d9',
'info_dict': {
'id': '53602801',
'ext': 'mpga',
'title': 'Interstellar',
'description': 'Listen now | Episode One',
'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538',
'uploader': 'Molly Movie Club',
'uploader_id': '839621',
},
},
{
'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r',
'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0',
'info_dict': {
'id': '57962052',
'ext': 'mpga',
'title': 'md5:855b2756f0ee10f6723fa00b16266f8d',
'description': 'md5:fe512a5e94136ad260c80bde00ea4eef',
'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59',
'uploader': 'Blocked and Reported',
'uploader_id': '500230',
},
},
{
'url': 'https://www.skimag.com/video/ski-people-1980/',
'info_dict': {
'id': 'ski-people-1980',
@ -3107,6 +3135,11 @@ class GenericIE(InfoExtractor):
# Don't set the extractor because it can be a track url or an album
return self.url_result(burl)
# Check for Substack custom domains
substack_url = SubstackIE._extract_url(webpage, url)
if substack_url:
return self.url_result(substack_url, SubstackIE)
# Look for embedded Vevo player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)

Wyświetl plik

@ -0,0 +1,100 @@
import re
import urllib.parse
from .common import InfoExtractor
from ..utils import str_or_none, traverse_obj
class SubstackIE(InfoExtractor):
    """Extractor for podcast and video posts hosted on Substack.

    Handles ``<username>.substack.com/p/<slug>`` URLs directly. Publications
    served from a custom domain can be mapped back to their canonical
    ``substack.com`` URL with :meth:`_extract_url`.
    """

    _VALID_URL = r'https?://(?P<username>[\w-]+)\.substack\.com/p/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://haleynahman.substack.com/p/i-made-a-vlog?s=r',
        'md5': 'f27e4fc6252001d48d479f45e65cdfd5',
        'info_dict': {
            'id': '47660949',
            'ext': 'mp4',
            'title': 'I MADE A VLOG',
            'description': 'md5:10c01ff93439a62e70ce963b2aa0b7f6',
            'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18',
            'uploader': 'Maybe Baby',
            'uploader_id': '33628',
        }
    }, {
        'url': 'https://haleynahman.substack.com/p/-dear-danny-i-found-my-boyfriends?s=r',
        'md5': '0a63eacec877a1171a62cfa69710fcea',
        'info_dict': {
            'id': '51045592',
            'ext': 'mpga',
            'title': "🎧 Dear Danny: I found my boyfriend's secret Twitter account",
            'description': 'md5:a57f2439319e56e0af92dd0c95d75797',
            'thumbnail': 'md5:daa40b6b79249417c14ff8103db29639',
            'uploader': 'Maybe Baby',
            'uploader_id': '33628',
        }
    }, {
        'url': 'https://andrewzimmern.substack.com/p/mussels-with-black-bean-sauce-recipe',
        'md5': 'fd3c07077b02444ff0130715b5f632bb',
        'info_dict': {
            'id': '47368578',
            'ext': 'mp4',
            'title': 'Mussels with Black Bean Sauce: Recipe of the Week #7',
            'description': 'md5:b96234a2906c7d854d5229818d889515',
            'thumbnail': 'md5:e30bfaa9da40e82aa62354263a9dd232',
            'uploader': "Andrew Zimmern's Spilled Milk ",
            'uploader_id': '577659',
        }
    }]

    @classmethod
    def _extract_url(cls, webpage, url):
        """Map a custom-domain Substack page to its ``*.substack.com`` URL.

        Args:
            webpage: HTML source of the page that was downloaded from ``url``.
            url: The URL the page was fetched from; its scheme, path, query
                and fragment are kept, only the host is replaced.

        Returns:
            The rewritten URL as a string, or ``None`` when the page does not
            load a script from ``substackcdn.com`` or no ``subdomain`` entry
            is found in the page.
        """
        # Only pages that load a script from the Substack CDN are considered.
        if not re.search(r'<script[^>]+src=["\']https://substackcdn\.com/[^"\']+\.js', webpage):
            return None

        # The subdomain is restricted to the characters _VALID_URL accepts, so
        # the rewritten URL is always one this extractor can match, and the
        # capture stops at the closing quote whichever quote style is used.
        mobj = re.search(
            r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[\w-]+)', webpage)
        if not mobj:
            return None

        parsed = urllib.parse.urlparse(url)
        return parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl()

    def _extract_video_formats(self, video_id, username):
        """Collect the formats and subtitles of an uploaded video.

        Args:
            video_id: Identifier of the video upload (``post.videoUpload.id``).
            username: Substack subdomain used to build the API URLs.

        Returns:
            A ``(formats, subtitles)`` tuple. The HLS manifest is fetched
            non-fatally, so its formats may be missing; the direct ``mp4``
            entry is always appended.
        """
        formats, subtitles = [], {}
        for video_format in ('hls', 'mp4'):
            video_url = f'https://{username}.substack.com/api/v1/video/upload/{video_id}/src?type={video_format}'

            if video_format == 'hls':
                fmts, subs = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', fatal=False)
                formats.extend(fmts)
                self._merge_subtitles(subs, target=subtitles)
            else:
                formats.append({
                    'url': video_url,
                    'ext': video_format,
                })

        return formats, subtitles

    def _real_extract(self, url):
        """Extract the media of a Substack post from its ``window._preloads`` JSON."""
        display_id, username = self._match_valid_url(url).group('id', 'username')
        webpage = self._download_webpage(url, display_id)

        # The page embeds its post/publication data as a JSON object assigned
        # to window._preloads inside a <script> tag.
        webpage_info = self._search_json(r'<script[^>]*>\s*window\._preloads\s*=', webpage, 'preloads', display_id)

        post_type = webpage_info['post']['type']
        formats, subtitles = [], {}
        if post_type == 'podcast':
            formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {}
        elif post_type == 'video':
            formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], username)
        else:
            # Any other post type carries no media this extractor handles.
            self.raise_no_formats(f'Page type "{post_type}" is not supported')

        self._sort_formats(formats)
        return {
            'id': str(webpage_info['post']['id']),
            'formats': formats,
            'subtitles': subtitles,
            'title': traverse_obj(webpage_info, ('post', 'title')),
            'description': traverse_obj(webpage_info, ('post', 'description')),
            'thumbnail': traverse_obj(webpage_info, ('post', 'cover_image')),
            'uploader': traverse_obj(webpage_info, ('pub', 'name')),
            'uploader_id': str_or_none(traverse_obj(webpage_info, ('post', 'publication_id'))),
        }