diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 42034275b..e3824c445 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1981,6 +1981,10 @@ from .theplatform import (
 from .thestar import TheStarIE
 from .thesun import TheSunIE
 from .theweatherchannel import TheWeatherChannelIE
+from .thirtydaysinger import (
+    ThirtyDaySingerIE,
+    ThirtyDaySingerPlaylistIE
+)
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisoldhouse import ThisOldHouseIE
 from .thisvid import (
diff --git a/yt_dlp/extractor/thirtydaysinger.py b/yt_dlp/extractor/thirtydaysinger.py
new file mode 100644
index 000000000..9d57e9b53
--- /dev/null
+++ b/yt_dlp/extractor/thirtydaysinger.py
@@ -0,0 +1,122 @@
+import re
+
+from .wistia import WistiaBaseIE
+from ..utils import (
+    clean_html,
+    get_elements_html_by_class
+)
+
+
+class ThirtyDaySingerBase(WistiaBaseIE):
+    """Shared lesson-extraction helpers for the ThirtyDaySinger extractors."""
+
+    _INDEX_EXTRACTION_RE = r'/tutorial/[\w-]+/(?P<index>[\w-]+)'
+
+    def _extract_for_url(self, url):
+        """Build the info dict for the lesson page at `url`.
+
+        Wistia media fields are merged with the title/description scraped from
+        the page; on key clashes the page values win.
+        """
+        lesson_index = re.search(self._INDEX_EXTRACTION_RE, url).group('index')
+        webpage = self._download_webpage(url, lesson_index)
+        match = next(self._extract_wistia_async_embed(webpage))
+        embed_config = self._download_embed_config('medias', match.group('id'), url)
+
+        embed_infojson = self._extract_media(embed_config)
+        webpage_infojson = self._extract_webpage_data(webpage)
+
+        return {**embed_infojson, **webpage_infojson}
+
+    def _extract_webpage_data(self, webpage):
+        """Scrape the lesson title and description from `webpage`."""
+        # NOTE(review): the tag name inside this pattern was lost in extraction;
+        # <h1> is assumed - confirm against the site markup.
+        # default=None lets a missing heading fall through to the page-title
+        # fallback below instead of raising before it can be used.
+        title = self._html_search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', default=None)
+        fallback_title = self._html_extract_title(webpage)
+        description = self._html_search_meta('description', webpage, fatal=False)
+        if description is not None:
+            # The meta tag is optional (fatal=False); only format what exists.
+            description = clean_html(self._format_html_list(description))
+
+        return {
+            'title': title or fallback_title,
+            'description': description,
+        }
+
+    # The site makes extensive use of HTML lists for formatting and `clean_html`
+    # doesn't handle them well. This is needed to keep lists readable.
+    def _format_html_list(self, html):
+        replacements = {
+            '