From 0c36dc00d7b9f43238bacb0e03730f31117d0b38 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Thu, 23 Jun 2022 01:42:39 +0530 Subject: [PATCH] [extractor/npr] Implement e50c3500b43d80e4492569c4b4523c4379c6fbb2 differently Closes #4141 --- yt_dlp/extractor/common.py | 42 ++++++++++++++++++-------------------- yt_dlp/extractor/npr.py | 16 +++++---------- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 90af41575..f4c34f43c 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1392,27 +1392,25 @@ class InfoExtractor: return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld_list = list(re.finditer(JSON_LD_RE, html)) - default = kwargs.get('default', NO_DEFAULT) - # JSON-LD may be malformed and thus `fatal` should be respected. - # At the same time `default` may be passed that assumes `fatal=False` - # for _search_regex. Let's simulate the same behavior here as well. - fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False - json_ld = [] - for mobj in json_ld_list: - json_ld_item = self._parse_json( - mobj.group('json_ld'), video_id, fatal=fatal) - if not json_ld_item: - continue - if isinstance(json_ld_item, dict): - json_ld.append(json_ld_item) - elif isinstance(json_ld_item, (list, tuple)): - json_ld.extend(json_ld_item) - if json_ld: - json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) - if json_ld: - return json_ld + def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT): + """Yield all json ld objects in the html""" + if default is not NO_DEFAULT: + fatal = False + for mobj in re.finditer(JSON_LD_RE, html): + json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal) + for json_ld in variadic(json_ld_item): + if isinstance(json_ld, dict): + yield json_ld + + def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT): + """Search for a video in any json ld in the html""" + if default is not NO_DEFAULT: + fatal = False + info = self._json_ld( + list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)), + video_id, fatal=fatal, expected_type=expected_type) + if info: + return info if default is not NO_DEFAULT: return default elif fatal: @@ -1500,7 +1498,7 @@ class InfoExtractor: assert is_type(e, 'VideoObject') author = e.get('author') info.update({ - 'url': traverse_obj(e, 'contentUrl', 'embedUrl', expected_type=url_or_none), + 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), 'thumbnails': [{'url': url} diff --git a/yt_dlp/extractor/npr.py b/yt_dlp/extractor/npr.py index e4ff8d6c2..e677e862d 100644 --- a/yt_dlp/extractor/npr.py +++ b/yt_dlp/extractor/npr.py @@ -1,9 +1,5 @@ from .common import InfoExtractor -from ..utils import ( - int_or_none, - qualities, - url_or_none, -) +from ..utils import int_or_none, qualities, traverse_obj, url_or_none class NprIE(InfoExtractor): @@ -74,10 +70,6 @@ class NprIE(InfoExtractor): })['list']['story'][0] playlist_title = story.get('title', {}).get('$text') - # Fetch the JSON-LD from the npr page. - json_ld = self._search_json_ld( - self._download_webpage(url, playlist_id), playlist_id, 'NewsArticle', fatal=False) - KNOWN_FORMATS = ('threegp', 'm3u8', 'smil', 'mp4', 'mp3') quality = qualities(KNOWN_FORMATS) @@ -124,8 +116,10 @@ class NprIE(InfoExtractor): stream_url, stream_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - if not formats and json_ld.get('url'): - formats.extend(self._extract_m3u8_formats(json_ld['url'], media_id, 'mp4', m3u8_id='hls', fatal=False)) + if not formats: + raw_json_ld = self._yield_json_ld(self._download_webpage(url, playlist_id), playlist_id, fatal=False) + m3u8_url = traverse_obj(list(raw_json_ld), (..., 'subjectOf', ..., 'embedUrl'), get_all=False) + formats = self._extract_m3u8_formats(m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False) self._sort_formats(formats)