From bcf597ea1736395bbb893af16a012f10e7a2b544 Mon Sep 17 00:00:00 2001 From: dirkf Date: Wed, 8 Feb 2023 18:16:51 +0000 Subject: [PATCH] [YouTube] Fix tests --- youtube_dl/extractor/youtube.py | 55 ++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6c70a98d1..ba0f5c8b6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,8 @@ from ..utils import ( dict_get, error_to_compat_str, float_or_none, + extract_attributes, + get_element_by_attribute, int_or_none, js_to_json, mimetype2ext, @@ -38,6 +40,7 @@ from ..utils import ( smuggle_url, str_or_none, str_to_int, + traverse_obj, try_get, unescapeHTML, unified_strdate, @@ -656,6 +659,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:bf77e03fcae5529475e500129b05668a', 'duration': 177, 'uploader': 'FlyingKitty', + 'uploader_id': 'FlyingKitty900', 'upload_date': '20200408', 'thumbnail': 'https://i.ytimg.com/vi/HsUATh_Nc2U/maxresdefault.jpg', 'age_limit': 18, @@ -678,6 +682,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:17eccca93a786d51bc67646756894066', 'duration': 106, 'uploader': 'Projekt Melody', + 'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ', 'upload_date': '20191227', 'age_limit': 18, 'thumbnail': 'https://i.ytimg.com/vi/Tq92D6wQ1mg/sddefault.jpg', @@ -929,16 +934,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'lsguqyKfVQg', 'ext': 'mp4', 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', - 'alt_title': 'Dark Walk - Position Music', + 'alt_title': 'Dark Walk', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'duration': 133, 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', - 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', - 'track': 'Dark Walk - Position Music', - 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan', + 'creator': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan', + 'track': 'Dark Walk', + 'artist': r're:Todd Haberman[;,]\s+Daniel Law Heath and Aaron Kaplan', 'album': 'Position Music - Production Music Vol. 143 - Dark Walk', }, 'params': { @@ -2091,7 +2096,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or microformat.get('lengthSeconds')) \ or parse_duration(search_meta('duration')) is_live = video_details.get('isLive') - owner_profile_url = microformat.get('ownerProfileUrl') + + def gen_owner_profile_url(): + yield microformat.get('ownerProfileUrl') + yield extract_attributes(self._search_regex( + r'''(?s)(]+\bitemprop\s*=\s*("|')url\2[^>]*>)''', + get_element_by_attribute('itemprop', 'author', webpage), + 'owner_profile_url', default='')).get('href') + + owner_profile_url = next( + (x for x in map(url_or_none, gen_owner_profile_url()) if x), + None) if not player_url: player_url = self._extract_player_url(webpage) @@ -2176,6 +2191,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info[d_k] = parse_duration(query[k][0]) if video_description: + # Youtube Music Auto-generated description mobj = re.search(r'(?s)(?P[^·\n]+)·(?P[^\n]+)\n+(?P[^\n]+)(?:.+?℗\s*(?P\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) if mobj: release_year = mobj.group('release_year') @@ -2250,7 +2266,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': info['location'] = stl else: - mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + # •? doesn't match, but [•]? does; \xa0 = non-breaking space + mobj = re.search(r'([^\xa0\s].*?)[\xa0\s]*S(\d+)[\xa0\s]*[•]?[\xa0\s]*E(\d+)', stl) if mobj: info.update({ 'series': mobj.group(1), @@ -2261,7 +2278,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): vpir, lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], list) or []): - tbr = tlb.get('toggleButtonRenderer') or {} + tbr = traverse_obj(tlb, ('segmentedLikeDislikeButtonRenderer', 'likeButton', 'toggleButtonRenderer'), 'toggleButtonRenderer') or {} for getter, regex in [( lambda x: x['defaultText']['accessibility']['accessibilityData'], r'(?P[\d,]+)\s*(?P(?:dis)?like)'), ([ @@ -2315,6 +2332,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif mrr_title == 'Song': info['track'] = mrr_contents_text + # this is not extraction but spelunking! + carousel_lockups = traverse_obj( + initial_data, + ('engagementPanels', Ellipsis, 'engagementPanelSectionListRenderer', + 'content', 'structuredDescriptionContentRenderer', 'items', Ellipsis, + 'videoDescriptionMusicSectionRenderer', 'carouselLockups', Ellipsis), + expected_type=dict) or [] + # try to reproduce logic from metadataRowContainerRenderer above (if it still is) + fields = (('ALBUM', 'album'), ('ARTIST', 'artist'), ('SONG', 'track'), ('LICENSES', 'license')) + # multiple_songs ? + if len(carousel_lockups) > 1: + fields = fields[-1:] + for info_row in traverse_obj( + carousel_lockups, + (0, 'carouselLockupRenderer', 'infoRows', Ellipsis, 'infoRowRenderer'), + expected_type=dict): + row_title = traverse_obj(info_row, ('title', 'simpleText')) + row_text = traverse_obj(info_row, 'defaultMetadata', 'expandedMetadata', expected_type=get_text) + if not row_text: + continue + for name, field in fields: + if name == row_title and not info.get(field): + info[field] = row_text + for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]: v = info.get(s_k) if v: