From 7a74feda7837bc54b6b0ccfed980ca9bf7c8a906 Mon Sep 17 00:00:00 2001 From: xarantolus Date: Fri, 19 Jun 2020 14:57:57 +0200 Subject: [PATCH 01/33] [youtube] Fix extraction of search urls (closes ytdl-org/youtube-dl#25696) --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1bc79e0147..1f16012b20 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3146,7 +3146,40 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P[^"]+))?' + _PLAYLIST_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): + playlist_json = self._search_regex(self._PLAYLIST_DATA, page, 'ytInitialData') + playlist_response = self._parse_json(playlist_json, None) + + result_items = try_get( + playlist_response, + lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + + # plobj either contains a 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'shelfRenderer' or 'searchPyvRenderer' (promoted video/ad) + for plobj in result_items: + video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) + video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or it is empty + continue + + video_title = video_title.strip() + + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page) class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): From 6dad89289cb2713065d8d28bd6adaf819188dc28 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 19 Jun 2020 21:29:47 +0200 Subject: [PATCH 02/33] [youtube] Move search URL extraction to appropriate extractor --- youtube_dl/extractor/youtube.py | 67 ++++++++++++++++----------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f16012b20..bb20f74c7b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3146,40 +3146,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _PLAYLIST_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - - def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): - playlist_json = self._search_regex(self._PLAYLIST_DATA, page, 'ytInitialData') - playlist_response = self._parse_json(playlist_json, None) - - result_items = try_get( - playlist_response, - lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) - - # plobj either contains a 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'shelfRenderer' or 'searchPyvRenderer' (promoted video/ad) - for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) - video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) - - if video_id is None or video_title is None: - # we do not have a videoRenderer or it is empty - continue - - video_title = video_title.strip() - - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): @@ -3243,6 +3210,7 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' + _SEARCH_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -3254,6 +3222,37 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): + search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) + + result_items = try_get( + search_response, + lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + + for plobj in result_items: + video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) + video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or it is empty + continue + + video_title = video_title.strip() + + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) From 57f72370c510607273157d4ea319adacb6273c58 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Sun, 21 Jun 2020 09:31:04 +0200 Subject: [PATCH 03/33] [youtube] Fix feed extraction This moves feed extraction from using html content to json metadata. However, loading additional pages no longer works. The _extract_video_info function also returns a continuation object that contains some metadata that - together with an API key that is in the page source - might be used to request the next page. --- youtube_dl/extractor/youtube.py | 110 ++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bb20f74c7b..29012bcbee 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3282,10 +3282,12 @@ class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. + Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties as well as an _extract_video_info function. """ _LOGIN_REQUIRED = True + _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + @property def IE_NAME(self): return 'youtube:%s' % self._FEED_NAME @@ -3296,34 +3298,41 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page + info = [] + for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: + video_info, continuation = self._extract_video_info(search_response) + + new_info = [] + + for v in video_info: + v_id = try_get(v, lambda x: x['videoId']) + if not v_id: + continue + + have_video = False + for old in info: + if old['videoId'] == v_id: + have_video = True + break + + if not have_video: + new_info.append(v) + + if not new_info: break - ids.extend(new_ids) + info.extend(new_info) - for entry in self._ids_to_results(new_ids): - yield entry + for video in new_info: + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: + if not continuation: break - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] + # TODO: Fix continuation request to download more pages def _real_extract(self, url): page = self._download_webpage( @@ -3372,6 +3381,32 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' + def _extract_video_info(self, initial_data): + videos = [] + continuation_renderer = None + + renderers = try_get( + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']) + + for renderer in renderers: + vid = try_get(renderer, lambda x: x['richItemRenderer']['content']['videoRenderer']) + if vid is not None: + videos.append(vid) + continue + + if 'richSectionRenderer' in renderer: + vids = try_get(renderer, lambda x: x['richSectionRenderer']['content']['richShelfRenderer']['contents']) + for v in vids: + vid = try_get(v, lambda x: x['richItemRenderer']['content']['videoRenderer']) + if vid is not None: + videos.append(vid) + continue + + if 'continuationItemRenderer' in renderer: + continuation_renderer = renderer + + return videos, continuation_renderer class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' @@ -3379,6 +3414,23 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' + def _extract_video_info(self, initial_data): + videos = [] + continuation_renderer = None + + renderers = try_get( + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) + for renderer in renderers: + for item in try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['shelfRenderer']['content']['gridRenderer']['items']): + vid = try_get(item, lambda x: x['gridVideoRenderer']) + if vid is not None: + videos.append(vid) + + if 'continuationItemRenderer' in renderer: + continuation_renderer = renderer + + return videos, continuation_renderer class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' @@ -3386,6 +3438,22 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'history' _PLAYLIST_TITLE = 'Youtube History' + def _extract_video_info(self, initial_data): + videos = [] + continuation_renderer = None + + renderers = try_get( + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) + for renderer in renderers: + vid = try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['videoRenderer']) + if vid is not None: + videos.append(vid) + + if 'continuationItemRenderer' in renderer: + continuation_renderer = renderer + + return videos, continuation_renderer class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' From b3fd4b155e7460ffd21e87eb29bc8a95902a429a Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Sun, 21 Jun 2020 09:41:42 +0200 Subject: [PATCH 04/33] run flake8 --- youtube_dl/extractor/youtube.py | 37 ++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 29012bcbee..bd83584629 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3286,7 +3286,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ _LOGIN_REQUIRED = True - _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' @property def IE_NAME(self): @@ -3299,20 +3299,20 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index info = [] - + for page_num in itertools.count(1): search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) video_info, continuation = self._extract_video_info(search_response) - - new_info = [] - + + new_info = [] + for v in video_info: v_id = try_get(v, lambda x: x['videoId']) if not v_id: continue - - have_video = False + + have_video = False for old in info: if old['videoId'] == v_id: have_video = True @@ -3386,15 +3386,15 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): continuation_renderer = None renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']) + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']) for renderer in renderers: vid = try_get(renderer, lambda x: x['richItemRenderer']['content']['videoRenderer']) if vid is not None: videos.append(vid) - continue - + continue + if 'richSectionRenderer' in renderer: vids = try_get(renderer, lambda x: x['richSectionRenderer']['content']['richShelfRenderer']['contents']) for v in vids: @@ -3402,12 +3402,13 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): if vid is not None: videos.append(vid) continue - + if 'continuationItemRenderer' in renderer: continuation_renderer = renderer return videos, continuation_renderer + class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' @@ -3419,8 +3420,8 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): continuation_renderer = None renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) for renderer in renderers: for item in try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['shelfRenderer']['content']['gridRenderer']['items']): vid = try_get(item, lambda x: x['gridVideoRenderer']) @@ -3432,6 +3433,7 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): return videos, continuation_renderer + class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' @@ -3441,10 +3443,10 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): def _extract_video_info(self, initial_data): videos = [] continuation_renderer = None - + renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) + initial_data, + lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) for renderer in renderers: vid = try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['videoRenderer']) if vid is not None: @@ -3455,6 +3457,7 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): return videos, continuation_renderer + class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list From 6a3cc8939415e246eacd5a6cc8007d6900f48079 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Tue, 23 Jun 2020 08:56:21 +0200 Subject: [PATCH 05/33] [youtube] Make search extraction less dependent on json schema. If an object looks like a video (it has a `videoId` key), assume that it is. --- youtube_dl/extractor/youtube.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bd83584629..69cc4a0170 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3222,16 +3222,37 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] + def _find_videos_in_json(self, extracted): + videos = [] + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if "videoId" in obj: + videos.append(obj) + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return videos + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) - result_items = try_get( - search_response, - lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + result_items = self._find_videos_in_json(search_response) for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) - video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + video_id = try_get(plobj, lambda x: x['videoId']) + video_title = try_get(plobj, lambda x: x['title']['runs'][0]['text']) if video_id is None or video_title is None: # we do not have a videoRenderer or it is empty From 5cbe7563bece11e52c833a79b0197ca4444ffe37 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Tue, 23 Jun 2020 11:27:02 +0200 Subject: [PATCH 06/33] [youtube] Return to old feed extraction code as it *seems* like that change was reverted The old code now works again, but it downloads without limit. This is why a limit of 1000 videos is added, it can be overwritten with the `--max-downloads` option - that way, only so many ids will be extracted as videos downloaded --- youtube_dl/extractor/youtube.py | 119 ++++++++------------------------ 1 file changed, 30 insertions(+), 89 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 69cc4a0170..745e14fa3c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3303,7 +3303,7 @@ class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ Base class for feed extractors - Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties as well as an _extract_video_info function. + Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True @@ -3319,41 +3319,44 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index - info = [] + limit = self._downloader.params.get('max_downloads') or 1000 + ids = [] + more_widget_html = content_html = page for page_num in itertools.count(1): - search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) + matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - video_info, continuation = self._extract_video_info(search_response) - - new_info = [] - - for v in video_info: - v_id = try_get(v, lambda x: x['videoId']) - if not v_id: - continue - - have_video = False - for old in info: - if old['videoId'] == v_id: - have_video = True - break - - if not have_video: - new_info.append(v) - - if not new_info: + # 'recommended' feed has infinite 'load more' and each new portion spins + # the same videos in (sometimes) slightly different order, so we'll check + # for unicity and break when portion has no new videos + new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) + if not new_ids: break - info.extend(new_info) + done = False + if len(new_ids) + len(ids) > limit: + new_ids = new_ids[:limit - len(ids)] + done = True - for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) + ids.extend(new_ids) - if not continuation: + for entry in self._ids_to_results(new_ids): + yield entry + + if done: break - # TODO: Fix continuation request to download more pages + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape, + headers=self._YOUTUBE_CLIENT_HEADERS) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] def _real_extract(self, url): page = self._download_webpage( @@ -3402,33 +3405,6 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' - def _extract_video_info(self, initial_data): - videos = [] - continuation_renderer = None - - renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['richGridRenderer']['contents']) - - for renderer in renderers: - vid = try_get(renderer, lambda x: x['richItemRenderer']['content']['videoRenderer']) - if vid is not None: - videos.append(vid) - continue - - if 'richSectionRenderer' in renderer: - vids = try_get(renderer, lambda x: x['richSectionRenderer']['content']['richShelfRenderer']['contents']) - for v in vids: - vid = try_get(v, lambda x: x['richItemRenderer']['content']['videoRenderer']) - if vid is not None: - videos.append(vid) - continue - - if 'continuationItemRenderer' in renderer: - continuation_renderer = renderer - - return videos, continuation_renderer - class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' @@ -3436,24 +3412,6 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' - def _extract_video_info(self, initial_data): - videos = [] - continuation_renderer = None - - renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) - for renderer in renderers: - for item in try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['shelfRenderer']['content']['gridRenderer']['items']): - vid = try_get(item, lambda x: x['gridVideoRenderer']) - if vid is not None: - videos.append(vid) - - if 'continuationItemRenderer' in renderer: - continuation_renderer = renderer - - return videos, continuation_renderer - class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' @@ -3461,23 +3419,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): _FEED_NAME = 'history' _PLAYLIST_TITLE = 'Youtube History' - def _extract_video_info(self, initial_data): - videos = [] - continuation_renderer = None - - renderers = try_get( - initial_data, - lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content']['sectionListRenderer']['contents']) - for renderer in renderers: - vid = try_get(renderer, lambda x: x['itemSectionRenderer']['contents'][0]['videoRenderer']) - if vid is not None: - videos.append(vid) - - if 'continuationItemRenderer' in renderer: - continuation_renderer = renderer - - return videos, continuation_renderer - class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' From c37ca4732bf806113e2645efaebd037a6bcc0b5c Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 10 Jul 2020 11:47:13 +0200 Subject: [PATCH 07/33] [youtube] Remote download limit --- youtube_dl/extractor/youtube.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 745e14fa3c..b53376d310 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3319,8 +3319,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _entries(self, page): # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index - limit = self._downloader.params.get('max_downloads') or 1000 - ids = [] more_widget_html = content_html = page for page_num in itertools.count(1): @@ -3333,19 +3331,11 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): if not new_ids: break - done = False - if len(new_ids) + len(ids) > limit: - new_ids = new_ids[:limit - len(ids)] - done = True - ids.extend(new_ids) for entry in self._ids_to_results(new_ids): yield entry - if done: - break - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not mobj: break From 7fa0a67cc1e5b5607fb6d30291a549e59e12c9b9 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 10 Jul 2020 11:50:50 +0200 Subject: [PATCH 08/33] Remove unused variable --- youtube_dl/extractor/youtube.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b53376d310..ade6625f3f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3307,8 +3307,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ _LOGIN_REQUIRED = True - _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - @property def IE_NAME(self): return 'youtube:%s' % self._FEED_NAME From 2bd94127a2319a88b5d98719f5e655682aed8b01 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 23 Jun 2020 15:08:50 +0100 Subject: [PATCH 09/33] [bellmedia] add support for cp24.com clip URLs(closes #25764) --- youtube_dl/extractor/bellmedia.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 485173774d..9f9de96c61 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -25,8 +25,8 @@ class BellMediaIE(InfoExtractor): etalk| marilyn )\.ca| - much\.com - )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' + (?:much|cp24)\.com + )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' _TESTS = [{ 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', 'md5': '36d3ef559cfe8af8efe15922cd3ce950', @@ -62,6 +62,9 @@ class BellMediaIE(InfoExtractor): }, { 'url': 'http://www.etalk.ca/video?videoid=663455', 'only_matching': True, + }, { + 'url': 'https://www.cp24.com/video?clipId=1982548', + 'only_matching': True, }] _DOMAINS = { 'thecomedynetwork': 'comedy', From 255f31b5cb42b5c13c1f775b0fa88737283d4526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 28 Jun 2020 10:30:03 +0700 Subject: [PATCH 10/33] [youtube:playlists] Extend _VALID_URL (closes #25810) --- youtube_dl/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ade6625f3f..974e009344 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3116,7 +3116,7 @@ class YoutubeLiveIE(YoutubeBaseInfoExtractor): class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' - _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' + _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists' IE_NAME = 'youtube:playlists' _TESTS = [{ @@ -3142,6 +3142,9 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'title': 'Chem Player', }, 'skip': 'Blocked', + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, }] From bb2c950b8eaac989032725923f8855be73d38596 Mon Sep 17 00:00:00 2001 From: Glenn Slayden <5589855+glenn-slayden@users.noreply.github.com> Date: Tue, 30 Jun 2020 12:56:16 -0700 Subject: [PATCH 11/33] [youtube] Prevent excess HTTP 301 (#25786) --- youtube_dl/extractor/youtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 974e009344..dd6f38e627 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -303,7 +303,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, @@ -2776,7 +2776,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): ids = [] last_id = playlist_id[-11:] for n in itertools.count(1): - url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) + url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) webpage = self._download_webpage( url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) new_ids = orderedSet(re.findall( @@ -3342,7 +3342,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape, headers=self._YOUTUBE_CLIENT_HEADERS) From 9fa728f4e89d0d6882a76cb27902029d60455993 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 11 Jul 2020 18:27:19 +0700 Subject: [PATCH 12/33] [wistia] Restrict embed regex (closes #25969) --- youtube_dl/extractor/wistia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 168e5e9015..77febd2eb1 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -56,7 +56,7 @@ class WistiaIE(InfoExtractor): urls.append(unescapeHTML(match.group('url'))) for match in re.finditer( r'''(?sx) - <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]{10})\b.*?\2 + <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1 ''', webpage): urls.append('wistia:%s' % match.group('id')) for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage): From 54ffcbb8eb06eeeb6b295f08b653e5449c373d47 Mon Sep 17 00:00:00 2001 From: MRWITEK <mrvvitek@gmail.com> Date: Tue, 14 Jul 2020 14:01:15 +0300 Subject: [PATCH 13/33] [youtube] Improve description extraction (closes #25937) (#25980) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index dd6f38e627..368952a69d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1930,7 +1930,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ''', replace_url, video_description) video_description = clean_html(video_description) else: - video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription') + video_description = video_details.get('shortDescription') or self._html_search_meta('description', video_webpage) if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): From 49004a6b59e3e09ce4533618e832cd94b242ba0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Jul 2020 05:04:50 +0700 Subject: [PATCH 14/33] [youtube] Fix sigfunc name extraction (closes #26134, closes #26135, closes #26136, closes #26137) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 368952a69d..cf910ae523 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1384,7 +1384,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(', - r'\b(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', From f4492c48904d441cbacdbc40bf978f674df3a3f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Jul 2020 05:07:54 +0700 Subject: [PATCH 15/33] [ChangeLog] Actualize [ci skip] --- ChangeLog | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ChangeLog b/ChangeLog index 07d6ccd69d..a49904c89f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +version <unreleased> + +Extractors +* [youtube] Fix sigfunc name extraction (#26134, #26135, #26136, #26137) +* [youtube] Improve description extraction (#25937, #25980) +* [wistia] Restrict embed regular expression (#25969) +* [youtube] Prevent excess HTTP 301 (#25786) ++ [youtube:playlists] Extend URL regular expression (#25810) ++ [bellmedia] Add support for cp24.com clip URLs (#25764) +* [brightcove] Improve embed detection (#25674) + + version 2020.06.16.1 Extractors From de722d3cd76c6fd4ba166c98cc681689534ee1a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 28 Jul 2020 05:13:03 +0700 Subject: [PATCH 16/33] release 2020.07.28 --- .github/ISSUE_TEMPLATE/1_broken_site.md | 6 +++--- .github/ISSUE_TEMPLATE/2_site_support_request.md | 4 ++-- .github/ISSUE_TEMPLATE/3_site_feature_request.md | 4 ++-- .github/ISSUE_TEMPLATE/4_bug_report.md | 6 +++--- .github/ISSUE_TEMPLATE/5_feature_request.md | 4 ++-- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index d29d5366fb..f2260db465 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.06.16.1 + [debug] youtube-dl version 2020.07.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index ee882f98cf..8bc05c4ba7 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 23033fe13d..98348e0cd6 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 5975313300..86706f5289 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.06.16.1 + [debug] youtube-dl version 2020.07.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 5cfcb93186..52c2709f94 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.06.16.1. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.06.16.1** +- [ ] I've verified that I'm running youtube-dl version **2020.07.28** - [ ] I've searched the bugtracker for similar feature requests including closed ones diff --git a/ChangeLog b/ChangeLog index a49904c89f..bf515f784b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2020.07.28 Extractors * [youtube] Fix sigfunc name extraction (#26134, #26135, #26136, #26137) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6b88eb38ca..17101fa475 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.06.16.1' +__version__ = '2020.07.28' From c449f709653bbd28293c8973f14a1c0a38600e58 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 30 Jul 2020 16:34:48 +0200 Subject: [PATCH 17/33] [youtube] Fix feed extraction In order to extract videos from further pages, we need to get various variables that are in an argument to the `ytcfg.set` call in a script on the feed page. --- youtube_dl/extractor/youtube.py | 96 ++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index cf910ae523..de70772c7e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3309,6 +3309,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True + _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3317,37 +3319,91 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() + + def _find_videos_in_json(self, extracted): + videos = [] + continuation = None + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if "videoId" in obj: + videos.append(obj) + return + + if "nextContinuationData" in obj: + nonlocal continuation + continuation = obj["nextContinuationData"] + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return videos, continuation + def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page + info = [] + + yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set'), None) + + search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) + for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + video_info, continuation = self._find_videos_in_json(search_response) - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: + new_info = [] + + for v in video_info: + v_id = try_get(v, lambda x: x['videoId']) + if not v_id: + continue + + have_video = False + for old in info: + if old['videoId'] == v_id: + have_video = True + break + + if not have_video: + new_info.append(v) + + if not new_info: break - ids.extend(new_ids) + info.extend(new_info) - for entry in self._ids_to_results(new_ids): - yield entry + for video in new_info: + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: + if not continuation: break - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + search_response = self._download_json( + 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] + query={ + "ctoken": try_get(continuation, lambda x: x["continuation"]), + "continuation": try_get(continuation, lambda x: x["continuation"]), + "itct": try_get(continuation, lambda x: x["clickTrackingParams"]) + }, + headers={ + "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]), + "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]), + "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]), + "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), + "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), + "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), + "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), + }) def _real_extract(self, url): page = self._download_webpage( From 4f37c60bf5f2af245985d314f0f64f473644feef Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 30 Jul 2020 16:38:56 +0200 Subject: [PATCH 18/33] Run formatter --- youtube_dl/extractor/youtube.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index de70772c7e..f6bed3f683 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3310,7 +3310,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ _LOGIN_REQUIRED = True _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" + _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3319,10 +3319,9 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _find_videos_in_json(self, extracted): videos = [] - continuation = None + continuation = None def _real_find(obj): if obj is None or isinstance(obj, str): @@ -3336,19 +3335,19 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): if "videoId" in obj: videos.append(obj) return - + if "nextContinuationData" in obj: nonlocal continuation continuation = obj["nextContinuationData"] - return - + return + for _, o in obj.items(): _real_find(o) _real_find(extracted) return videos, continuation - + def _entries(self, page): info = [] @@ -3359,14 +3358,14 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): for page_num in itertools.count(1): video_info, continuation = self._find_videos_in_json(search_response) - new_info = [] + new_info = [] for v in video_info: v_id = try_get(v, lambda x: x['videoId']) if not v_id: continue - have_video = False + have_video = False for old in info: if old['videoId'] == v_id: have_video = True @@ -3402,7 +3401,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), - "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), + "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), }) def _real_extract(self, url): From a5e386d9feb0e54013ec5aa1ba106869240fb995 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 31 Jul 2020 10:05:11 +0200 Subject: [PATCH 19/33] Fix python2 compatibility and title extraction --- youtube_dl/extractor/youtube.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f6bed3f683..ad8db2c2d8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3321,7 +3321,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _find_videos_in_json(self, extracted): videos = [] - continuation = None + c = {} def _real_find(obj): if obj is None or isinstance(obj, str): @@ -3337,8 +3337,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): return if "nextContinuationData" in obj: - nonlocal continuation - continuation = obj["nextContinuationData"] + c["continuation"] = obj["nextContinuationData"] return for _, o in obj.items(): @@ -3346,7 +3345,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): _real_find(extracted) - return videos, continuation + return videos, try_get(c, lambda x: x["continuation"]) def _entries(self, page): info = [] @@ -3380,7 +3379,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): info.extend(new_info) for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) if not continuation: break From 7d743516b541cf448bbaaa35ac95f8ecc8139432 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Mon, 24 Aug 2020 14:29:16 +0200 Subject: [PATCH 20/33] [youtube] Make `ytcfg.set` config extraction non-fatal If the markup of the page changes in the future, it might be possible that _FEED_DATA still works, but the other regex does not. SInce it is not necessary for the first page of videos, we make sure the program doesn't exit before extracting them. TL;DR: Extract the first video page even if there are problems --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ad8db2c2d8..ee8a4626d9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3350,7 +3350,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _entries(self, page): info = [] - yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set'), None) + yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False) search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) @@ -3381,7 +3381,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): for video in new_info: yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) - if not continuation: + if not continuation or not yt_conf: break search_response = self._download_json( From 94255fa0b165d0646ae42e9b114f9dddaebc3123 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Mon, 24 Aug 2020 14:30:08 +0200 Subject: [PATCH 21/33] [youtube] More general title extraction Seems like this attribute is moved every few weeks, so we just extract both and use the one that is present. --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ee8a4626d9..8f622662ab 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3379,7 +3379,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): info.extend(new_info) for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText'])) if not continuation or not yt_conf: break From 4c47858c0584f5e38904871f8543f7271d703cc2 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 3 Sep 2020 20:41:45 +0200 Subject: [PATCH 22/33] Fix regex for other variable declaration type This now supports declarations like `window["ytInitialData"] = ...` and `var ytInitialData = ...` --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8f622662ab..e62096bb27 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3309,7 +3309,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property From b948643f9c069da5bfbe89e2b311c91ca0313262 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 19 Jun 2020 14:57:57 +0200 Subject: [PATCH 23/33] [youtube] Fix extraction of search urls (closes ytdl-org/youtube-dl#25696) --- youtube_dl/extractor/youtube.py | 35 ++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6ae2e58c17..eafd8b7af0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3153,7 +3153,40 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' + _PLAYLIST_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): + playlist_json = self._search_regex(self._PLAYLIST_DATA, page, 'ytInitialData') + playlist_response = self._parse_json(playlist_json, None) + + result_items = try_get( + playlist_response, + lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + + # plobj either contains a 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'shelfRenderer' or 'searchPyvRenderer' (promoted video/ad) + for plobj in result_items: + video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) + video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or it is empty + continue + + video_title = video_title.strip() + + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page) class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): From 19f671f88b2f45c833a9fc7f6f2f7d9016eccc86 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 19 Jun 2020 21:29:47 +0200 Subject: [PATCH 24/33] [youtube] Move search URL extraction to appropriate extractor --- youtube_dl/extractor/youtube.py | 67 ++++++++++++++++----------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eafd8b7af0..22064616a8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3153,40 +3153,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _PLAYLIST_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - - def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): - playlist_json = self._search_regex(self._PLAYLIST_DATA, page, 'ytInitialData') - playlist_response = self._parse_json(playlist_json, None) - - result_items = try_get( - playlist_response, - lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) - - # plobj either contains a 'playlistRenderer', 'videoRenderer', 'channelRenderer', 'shelfRenderer' or 'searchPyvRenderer' (promoted video/ad) - for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) - video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) - - if video_id is None or video_title is None: - # we do not have a videoRenderer or it is empty - continue - - video_title = video_title.strip() - - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): @@ -3250,6 +3217,7 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' + _SEARCH_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -3261,6 +3229,37 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): + search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) + + result_items = try_get( + search_response, + lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + + for plobj in result_items: + video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) + video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or it is empty + continue + + video_title = video_title.strip() + + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) + return zip(ids_in_page, titles_in_page) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) From e03b4f3e056b80b99dd4ab4eed12c7089fb80a43 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Tue, 23 Jun 2020 08:56:21 +0200 Subject: [PATCH 25/33] [youtube] Make search extraction less dependent on json schema. If an object looks like a video (it has a `videoId` key), assume that it is. --- youtube_dl/extractor/youtube.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 22064616a8..be04459627 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3229,16 +3229,37 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] + def _find_videos_in_json(self, extracted): + videos = [] + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if "videoId" in obj: + videos.append(obj) + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return videos + def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) - result_items = try_get( - search_response, - lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents']) + result_items = self._find_videos_in_json(search_response) for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoRenderer']['videoId']) - video_title = try_get(plobj, lambda x: x['videoRenderer']['title']['runs'][0]['text']) + video_id = try_get(plobj, lambda x: x['videoId']) + video_title = try_get(plobj, lambda x: x['title']['runs'][0]['text']) if video_id is None or video_title is None: # we do not have a videoRenderer or it is empty From 5c430b67bd6befe4c5f257ba40b8d51979c1028c Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 30 Jul 2020 16:34:48 +0200 Subject: [PATCH 26/33] [youtube] Fix feed extraction In order to extract videos from further pages, we need to get various variables that are in an argument to the `ytcfg.set` call in a script on the feed page. --- youtube_dl/extractor/youtube.py | 96 ++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index be04459627..64c4ef32cf 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3313,6 +3313,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True + _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3321,37 +3323,91 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() + + def _find_videos_in_json(self, extracted): + videos = [] + continuation = None + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if "videoId" in obj: + videos.append(obj) + return + + if "nextContinuationData" in obj: + nonlocal continuation + continuation = obj["nextContinuationData"] + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return videos, continuation + def _entries(self, page): - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page + info = [] + + yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set'), None) + + search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) + for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + video_info, continuation = self._find_videos_in_json(search_response) - # 'recommended' feed has infinite 'load more' and each new portion spins - # the same videos in (sometimes) slightly different order, so we'll check - # for unicity and break when portion has no new videos - new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches))) - if not new_ids: + new_info = [] + + for v in video_info: + v_id = try_get(v, lambda x: x['videoId']) + if not v_id: + continue + + have_video = False + for old in info: + if old['videoId'] == v_id: + have_video = True + break + + if not have_video: + new_info.append(v) + + if not new_info: break - ids.extend(new_ids) + info.extend(new_info) - for entry in self._ids_to_results(new_ids): - yield entry + for video in new_info: + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: + if not continuation: break - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, + search_response = self._download_json( + 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] + query={ + "ctoken": try_get(continuation, lambda x: x["continuation"]), + "continuation": try_get(continuation, lambda x: x["continuation"]), + "itct": try_get(continuation, lambda x: x["clickTrackingParams"]) + }, + headers={ + "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]), + "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]), + "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]), + "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), + "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), + "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), + "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), + }) def _real_extract(self, url): page = self._download_webpage( From f536080701c29829d6eebefeb4915307ee44e7d8 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 30 Jul 2020 16:38:56 +0200 Subject: [PATCH 27/33] Run formatter --- youtube_dl/extractor/youtube.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 64c4ef32cf..d97e0ab4e5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3314,7 +3314,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ _LOGIN_REQUIRED = True _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' - _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" + _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3323,10 +3323,9 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _find_videos_in_json(self, extracted): videos = [] - continuation = None + continuation = None def _real_find(obj): if obj is None or isinstance(obj, str): @@ -3340,19 +3339,19 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): if "videoId" in obj: videos.append(obj) return - + if "nextContinuationData" in obj: nonlocal continuation continuation = obj["nextContinuationData"] - return - + return + for _, o in obj.items(): _real_find(o) _real_find(extracted) return videos, continuation - + def _entries(self, page): info = [] @@ -3363,14 +3362,14 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): for page_num in itertools.count(1): video_info, continuation = self._find_videos_in_json(search_response) - new_info = [] + new_info = [] for v in video_info: v_id = try_get(v, lambda x: x['videoId']) if not v_id: continue - have_video = False + have_video = False for old in info: if old['videoId'] == v_id: have_video = True @@ -3406,7 +3405,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), - "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), + "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), }) def _real_extract(self, url): From 299056ad52222911eea22db0b1a0715bef7572ef Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Fri, 31 Jul 2020 10:05:11 +0200 Subject: [PATCH 28/33] Fix python2 compatibility and title extraction --- youtube_dl/extractor/youtube.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d97e0ab4e5..ec631cd229 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3325,7 +3325,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _find_videos_in_json(self, extracted): videos = [] - continuation = None + c = {} def _real_find(obj): if obj is None or isinstance(obj, str): @@ -3341,8 +3341,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): return if "nextContinuationData" in obj: - nonlocal continuation - continuation = obj["nextContinuationData"] + c["continuation"] = obj["nextContinuationData"] return for _, o in obj.items(): @@ -3350,7 +3349,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): _real_find(extracted) - return videos, continuation + return videos, try_get(c, lambda x: x["continuation"]) def _entries(self, page): info = [] @@ -3384,7 +3383,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): info.extend(new_info) for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['simpleText'])) + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) if not continuation: break From 1f93faf60bb1447ff1aa661e46916e863640ade2 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Mon, 24 Aug 2020 14:29:16 +0200 Subject: [PATCH 29/33] [youtube] Make `ytcfg.set` config extraction non-fatal If the markup of the page changes in the future, it might be possible that _FEED_DATA still works, but the other regex does not. SInce it is not necessary for the first page of videos, we make sure the program doesn't exit before extracting them. TL;DR: Extract the first video page even if there are problems --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ec631cd229..ec821cbc04 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3354,7 +3354,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _entries(self, page): info = [] - yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set'), None) + yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False) search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) @@ -3385,7 +3385,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): for video in new_info: yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) - if not continuation: + if not continuation or not yt_conf: break search_response = self._download_json( From f442082a50f94fc3c36db954764b70d6a08beaa1 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Mon, 24 Aug 2020 14:30:08 +0200 Subject: [PATCH 30/33] [youtube] More general title extraction Seems like this attribute is moved every few weeks, so we just extract both and use the one that is present. --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ec821cbc04..c8d80bbd2b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3383,7 +3383,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): info.extend(new_info) for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text'])) + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText'])) if not continuation or not yt_conf: break From bea9b00588a2d5376c8edeaa968d4c484db415c8 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Thu, 3 Sep 2020 20:41:45 +0200 Subject: [PATCH 31/33] Fix regex for other variable declaration type This now supports declarations like `window["ytInitialData"] = ...` and `var ytInitialData = ...` --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c8d80bbd2b..c03ca5b31c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3313,7 +3313,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - _FEED_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property From c0a1a8926d91b7d1656240bbfc880b160811a3b9 Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Tue, 22 Sep 2020 20:52:52 +0200 Subject: [PATCH 32/33] Use better regex for all fixed extraction types --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c03ca5b31c..1f9cc73717 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3217,7 +3217,7 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _SEARCH_DATA = r'window\[\"ytInitialData\"\]\W?=\W?({.*?});' + _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, From 955c4cb6ac87d997e090cb809c21bba8cc6e3e0a Mon Sep 17 00:00:00 2001 From: xarantolus <xarantolus@protonmail.com> Date: Wed, 30 Sep 2020 15:49:51 +0200 Subject: [PATCH 33/33] [youtube/search_url]: improve title extraction --- youtube_dl/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f9cc73717..6207585cf5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -3257,12 +3257,12 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): result_items = self._find_videos_in_json(search_response) - for plobj in result_items: - video_id = try_get(plobj, lambda x: x['videoId']) - video_title = try_get(plobj, lambda x: x['title']['runs'][0]['text']) + for renderer in result_items: + video_id = try_get(renderer, lambda x: x['videoId']) + video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText']) if video_id is None or video_title is None: - # we do not have a videoRenderer or it is empty + # we do not have a videoRenderer or title extraction broke continue video_title = video_title.strip()