From f0bc6e2019a2f81d358ebddc4ae4cf8e9e4ed905 Mon Sep 17 00:00:00 2001 From: coletdev Date: Sun, 19 Jun 2022 00:55:18 +0000 Subject: [PATCH] [extractor] Add `default` parameter to `_search_json` (#4057) Authored by: pukkandan, coletdjnz --- yt_dlp/extractor/archiveorg.py | 4 ++-- yt_dlp/extractor/common.py | 29 ++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index c1c9b0adf5..179602d466 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -486,9 +486,9 @@ class YoutubeWebArchiveIE(InfoExtractor): search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None)) player_response = self._search_json( self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', - video_id, fatal=False) + video_id, default={}) initial_data = self._search_json( - self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, fatal=False) + self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={}) initial_data_video = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 601394b416..093a9b5cd1 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1188,13 +1188,32 @@ class InfoExtractor: self.report_warning('unable to extract %s' % _name + bug_reports_message()) return None - def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', contains_pattern='(?s:.+)', fatal=True, **kwargs): + def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', + contains_pattern='(?s:.+)', fatal=True, default=NO_DEFAULT, **kwargs): """Searches string for the JSON object specified by start_pattern""" # NB: end_pattern is only used to reduce the size of the initial match - return 
self._parse_json( - self._search_regex(rf'{start_pattern}\s*(?P<json>{{{contains_pattern}}})\s*{end_pattern}', - string, name, group='json', fatal=fatal) or '{}', - video_id, fatal=fatal, ignore_extra=True, **kwargs) or {} + if default is NO_DEFAULT: + default, has_default = {}, False + else: + fatal, has_default = False, True + + json_string = self._search_regex( + rf'{start_pattern}\s*(?P<json>{{\s*{contains_pattern}\s*}})\s*{end_pattern}', + string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT) + if not json_string: + return default + + _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) + try: + return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs) + except ExtractorError as e: + if fatal: + raise ExtractorError( + f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id) + elif not has_default: + self.report_warning( + f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id) + return default def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """