From d53a263d124ec1e561deccd9d6adf7e0491fa6c7 Mon Sep 17 00:00:00 2001
From: jahway603
Date: Sat, 9 Dec 2023 14:23:13 -0500
Subject: [PATCH] [Beatport] Fix song downloading to resolve Issue #8564

---
 yt_dlp/extractor/beatport.py | 100 ++++++++++++++++++++++++++++++++---
 yt_dlp/utils/__init__.py     |  10 ++++
 2 files changed, 101 insertions(+), 9 deletions(-)

diff --git a/yt_dlp/extractor/beatport.py b/yt_dlp/extractor/beatport.py
index 0aecbd089..b8a02a3f5 100644
--- a/yt_dlp/extractor/beatport.py
+++ b/yt_dlp/extractor/beatport.py
@@ -1,20 +1,32 @@
-import re
-
 from .common import InfoExtractor
 from ..compat import compat_str
-from ..utils import int_or_none
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    join_nonempty,
+    merge_dicts,
+    parse_iso8601,
+    T,
+    traverse_obj,
+    txt_or_none,
+    unified_strdate,
+    url_or_none,
+    variadic,
+)
 
 
 class BeatportIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.|pro\.)?beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'https://beatport.com/track/synesthesia-original-mix/5379371',
-        'md5': 'b3c34d8639a2f6a7f734382358478887',
+        'md5': 'cfcc245aafcad52a837b2c5a60a472c9',
         'info_dict': {
             'id': '5379371',
             'display_id': 'synesthesia-original-mix',
-            'ext': 'mp4',
+            'ext': 'mp3',
             'title': 'Froxic - Synesthesia (Original Mix)',
+            'timestamp': 1397854513,
+            'upload_date': '20140428',
         },
     }, {
         'url': 'https://beatport.com/track/love-and-war-original-mix/3756896',
@@ -24,19 +36,89 @@ class BeatportIE(InfoExtractor):
             'display_id': 'love-and-war-original-mix',
             'ext': 'mp3',
             'title': 'Wolfgang Gartner - Love & War (Original Mix)',
+            'timestamp': 1346195831,
+            'upload_date': '20120917',
         },
     }, {
         'url': 'https://beatport.com/track/birds-original-mix/4991738',
-        'md5': 'a1fd8e8046de3950fd039304c186c05f',
+        'md5': '2dff00955b13c182931a708d979801b6',
         'info_dict': {
             'id': '4991738',
             'display_id': 'birds-original-mix',
-            'ext': 'mp4',
+            'ext': 'mp3',
             'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)",
+            'timestamp': 1386121876,
+            'upload_date': '20131209',
         }
     }]
 
     def _real_extract(self, url):
+        """Extract a track from the page's Next.js data, falling back to ``_old_real_extract``."""
+        mobj = self._match_valid_url(url)
+        track_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        # Track metadata is looked up under props.pageProps.track of the Next.js data
+        next_data = self._search_nextjs_data(webpage, display_id, fatal=False)
+        track = traverse_obj(
+            next_data,
+            ('props', 'pageProps', lambda k, v: k == 'track' and v['id'] == int(track_id)),
+            get_all=False)
+        if not track:
+            # No Next.js data or no matching track in it: subscripting a missing
+            # track below would raise TypeError, so use the previous extraction path
+            return self._old_real_extract(url)
+
+        title = track['name']
+        artists = ', '.join(traverse_obj(track, ('artists', ..., 'name', T(txt_or_none)))) or None
+        title = join_nonempty(artists, title, delim=' - ')
+        title = join_nonempty(
+            title, traverse_obj(track, ('mix_name', T(lambda s: '(' + s + ')'))),
+            delim=' ')
+
+        formats = []
+        # next.js page has <= 1 sample URL
+        f_url = traverse_obj(track, ('sample_url', T(url_or_none)))
+        if f_url:
+            ext = determine_ext(f_url)
+            fmt = {
+                'url': f_url,
+                'ext': ext,
+                'format_id': ext,
+                'vcodec': 'none',
+            }
+            if ext == 'mp3':
+                fmt['preference'] = 0
+                fmt['acodec'] = 'mp3'
+                fmt['abr'] = 96
+                fmt['asr'] = 44100
+            elif ext == 'mp4':
+                fmt['preference'] = 1
+                fmt['acodec'] = 'aac'
+                fmt['abr'] = 96
+                fmt['asr'] = 44100
+            formats.append(fmt)
+
+        return merge_dicts({
+            'id': track_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'artist': artists,
+        }, traverse_obj(track, {
+            # NOTE(review): 'catalog_number' looks like a label catalogue ID, not a disc index - confirm
+            'disc_number': ('catalog_number', T(int_or_none)),
+            'timestamp': ('encoded_date', T(parse_iso8601)),
+            'categories': ('genre', 'name', T(txt_or_none), T(variadic)),
+            'thumbnail': ('image', 'uri', T(url_or_none)),
+            'upload_date': (('new_release_date', 'publish_date'), T(unified_strdate)),
+            'track_number': ('number', T(int_or_none)),
+            'album': ('release', 'name', T(txt_or_none)),
+        }, get_all=False))
+
+    def _old_real_extract(self, url):
+        """Extract a track from the ``window.Playables`` JSON embedded in the page (previous extraction path)."""
         mobj = self._match_valid_url(url)
         track_id = mobj.group('id')
         display_id = mobj.group('display_id')
@@ -45,8 +127,8 @@ class BeatportIE(InfoExtractor):
 
         playables = self._parse_json(
             self._search_regex(
-                r'window\.Playables\s*=\s*({.+?});', webpage,
-                'playables info', flags=re.DOTALL),
+                r'(?s)window\.Playables\s*=\s*({.+?});', webpage,
+                'playables info'),
             track_id)
 
         track = next(t for t in playables['tracks'] if t['id'] == int(track_id))
diff --git a/yt_dlp/utils/__init__.py b/yt_dlp/utils/__init__.py
index c267e326f..e355438e5 100644
--- a/yt_dlp/utils/__init__.py
+++ b/yt_dlp/utils/__init__.py
@@ -8,3 +8,13 @@ del passthrough_module
 from .traversal import *
 from ._utils import *
 from ._utils import _configuration_args, _get_exe_version_output  # noqa: F401
+
+
+def txt_or_none(v, default=None):
+    """Return ``str(v)`` stripped of surrounding whitespace, or *default* if *v* is None or the result is blank."""
+    return default if v is None else (str(v).strip() or default)
+
+
+def T(x):
+    """Return the one-element set ``{x}``, as used for the transformation steps in ``traverse_obj`` paths."""
+    return {x}