From a49e777d592ea8f0a21832b08ba2e70456d9914e Mon Sep 17 00:00:00 2001 From: Felix S Date: Thu, 14 Apr 2022 13:22:47 +0000 Subject: [PATCH] [spotify] Detect iframe embeds (#3430) Authored by: fstirlitz --- yt_dlp/extractor/generic.py | 6 ++++++ yt_dlp/extractor/spotify.py | 15 ++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index c708b4cee8..8192fbb860 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -67,6 +67,7 @@ from .simplecast import SimplecastIE from .soundcloud import SoundcloudEmbedIE from .spankwire import SpankwireIE from .sportbox import SportBoxIE +from .spotify import SpotifyBaseIE from .springboardplatform import SpringboardPlatformIE from .svt import SVTIE from .teachable import TeachableIE @@ -3164,6 +3165,11 @@ class GenericIE(InfoExtractor): if sportbox_urls: return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) + # Look for embedded Spotify player + spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage) + if spotify_urls: + return self.playlist_from_matches(spotify_urls, video_id, video_title) + # Look for embedded XHamster player xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) if xhamster_urls: diff --git a/yt_dlp/extractor/spotify.py b/yt_dlp/extractor/spotify.py index 3b8dea8f49..3128825e5d 100644 --- a/yt_dlp/extractor/spotify.py +++ b/yt_dlp/extractor/spotify.py @@ -19,7 +19,7 @@ class SpotifyBaseIE(InfoExtractor): 'MinimalShow': '13ee079672fad3f858ea45a55eb109553b4fb0969ed793185b2e34cbb6ee7cc0', 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d', } - _VALID_URL_TEMPL = r'https?://open\.spotify\.com/%s/(?P<id>[^/?&#]+)' + _VALID_URL_TEMPL = r'https?://open\.spotify\.com/(?:embed-podcast/|embed/|)%s/(?P<id>[^/?&#]+)' def _real_initialize(self): self._ACCESS_TOKEN = self._download_json( @@ -93,11 +93,17 @@ class SpotifyBaseIE(InfoExtractor): 'series': 
series, } + @classmethod + def _extract_embed_urls(cls, webpage): + return re.findall( + r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"', + webpage) + class SpotifyIE(SpotifyBaseIE): IE_NAME = 'spotify' _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode' - _TEST = { + _TESTS = [{ 'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo', 'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b', 'info_dict': { @@ -109,7 +115,10 @@ class SpotifyIE(SpotifyBaseIE): 'release_date': '20201217', 'series': "The Guardian's Audio Long Reads", } - } + }, { + 'url': 'https://open.spotify.com/embed/episode/4TvCsKKs2thXmarHigWvXE?si=7eatS8AbQb6RxqO2raIuWA', + 'only_matching': True, + }] def _real_extract(self, url): episode_id = self._match_id(url)