From d05460e5fee0dca9ab7463c78c630653fb37dcde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Grosdidier?=
Date: Sat, 11 Jun 2022 01:22:34 +0200
Subject: [PATCH] [extractor/FranceCulture] Fix extractor (#3874)

Closes #3742
Authored by: aurelg, pukkandan
---
 yt_dlp/extractor/common.py        |   4 +-
 yt_dlp/extractor/franceculture.py | 137 +++++++-----------------------
 2 files changed, 31 insertions(+), 110 deletions(-)

diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 316b58ce3..f63be0975 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1188,11 +1188,11 @@ class InfoExtractor:
             self.report_warning('unable to extract %s' % _name + bug_reports_message())
             return None
 
-    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', fatal=True, **kwargs):
+    def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', contains_pattern='.+', fatal=True, **kwargs):
         """Searches string for the JSON object specified by start_pattern"""
         # NB: end_pattern is only used to reduce the size of the initial match
         return self._parse_json(
-            self._search_regex(rf'{start_pattern}\s*(?P<json>{{.+}})\s*{end_pattern}',
+            self._search_regex(rf'{start_pattern}\s*(?P<json>{{{contains_pattern}}})\s*{end_pattern}',
                                string, name, group='json', fatal=fatal) or '{}',
             video_id, fatal=fatal, ignore_extra=True, **kwargs) or {}
 
diff --git a/yt_dlp/extractor/franceculture.py b/yt_dlp/extractor/franceculture.py
index 6bd9912f3..ccb98e22c 100644
--- a/yt_dlp/extractor/franceculture.py
+++ b/yt_dlp/extractor/franceculture.py
@@ -1,125 +1,46 @@
-import re
 from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-    extract_attributes,
-    int_or_none,
-    traverse_obj,
-    unified_strdate,
-)
+from ..utils import int_or_none, parse_duration, unified_strdate
 
 
 class FranceCultureIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
-    _TESTS = [{
-        # playlist
-        'url': 'https://www.franceculture.fr/emissions/serie/hasta-dente',
-        'playlist_count': 12,
-        'info_dict': {
-            'id': 'hasta-dente',
-            'title': 'Hasta Dente',
-            'description': 'md5:57479af50648d14e9bb649e6b1f8f911',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'upload_date': '20201024',
-        },
-        'playlist': [{
+    _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/franceculture/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'
+    _TESTS = [
+        {
+            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
             'info_dict': {
-                'id': '3c1c2e55-41a0-11e5-9fe0-005056a87c89',
+                'id': '8440487',
+                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                 'ext': 'mp3',
-                'title': 'Jeudi, vous avez dit bizarre ?',
-                'description': 'md5:47cf1e00cc21c86b0210279996a812c6',
-                'duration': 604,
-                'upload_date': '20201024',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'timestamp': 1603576680
+                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
+                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
+                'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
+                'upload_date': '20220514',
+                'duration': 2750,
             },
         },
-        ],
-    }, {
-        'url': 'https://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
-        'info_dict': {
-            'id': 'rendez-vous-au-pays-des-geeks',
-            'display_id': 'rendez-vous-au-pays-des-geeks',
-            'ext': 'mp3',
-            'title': 'Rendez-vous au pays des geeks',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'upload_date': '20140301',
-            'vcodec': 'none',
-            'duration': 3569,
-        },
-    }, {
-        # no thumbnail
-        'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018',
-        'only_matching': True,
-    }]
+    ]
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
+        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
         webpage = self._download_webpage(url, display_id)
 
-        info = {
-            'id': display_id,
-            'title': self._html_search_regex(
-                r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
-                webpage, 'title', default=self._og_search_title(webpage)),
+        # _search_json_ld doesn't correctly handle this. See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
+        video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'\s*"@type"\s*:\s*"AudioObject"\s*.+')
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_data['contentUrl'],
+            'ext': video_data.get('encodingFormat'),
+            'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
+            'duration': parse_duration(video_data.get('duration')),
+            'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
+                                             webpage, 'title', default=self._og_search_title(webpage)),
             'description': self._html_search_regex(
-                r'(?s)<div[^>]+class="excerpt"[^>]*>(.*?)</div>', webpage, 'description', default=None),
+                r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None),
             'thumbnail': self._og_search_thumbnail(webpage),
             'uploader': self._html_search_regex(
                 r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None),
-            'upload_date': unified_strdate(self._html_search_regex(
-                r'(?s)class="teaser-text-date".*?(\d{2}/\d{2}/\d{4})', webpage, 'date', default=None)),
-        }
-
-        playlist_data = self._search_regex(
-            r'''(?sx)
-                <section[^>]+data-xiti-place="[^"]*?liste_episodes[^"?]*?"[^>]*>
-                (.*?)
-                </section>
-            ''',
-            webpage, 'playlist data', fatal=False, default=None)
-
-        if playlist_data:
-            entries = []
-            for item, item_description in re.findall(
-                    r'(?s)(<button[^<]*class="[^"]*replay-button[^>]*>).*?<p[^>]*class="[^"]*teaser-text-chapo[^>]*>(.*?)</p>',
-                    playlist_data):
-
-                item_attributes = extract_attributes(item)
-                entries.append({
-                    'id': item_attributes.get('data-emission-uuid'),
-                    'url': item_attributes.get('data-url'),
-                    'title': item_attributes.get('data-diffusion-title'),
-                    'duration': int_or_none(traverse_obj(item_attributes, 'data-duration-seconds', 'data-duration-seconds')),
-                    'description': item_description,
-                    'timestamp': int_or_none(item_attributes.get('data-start-time')),
-                    'thumbnail': info['thumbnail'],
-                    'uploader': info['uploader'],
-                })
-
-            return {
-                '_type': 'playlist',
-                'entries': entries,
-                **info
-            }
-
-        video_data = extract_attributes(self._search_regex(
-            r'''(?sx)
-                (?:
-                    </h1>|
-                    <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>
-                ).*?
-                (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>)
-            ''',
-            webpage, 'video data'))
-        video_url = traverse_obj(video_data, 'data-url', 'data-asset-source')
-        ext = determine_ext(video_url.lower())
-
-        return {
-            'display_id': display_id,
-            'url': video_url,
-            'ext': ext,
-            'vcodec': 'none' if ext == 'mp3' else None,
-            'duration': int_or_none(video_data.get('data-duration')),
-            **info
+            'upload_date': unified_strdate(self._search_regex(
+                r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
         }
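
Note (not part of the patch): the standalone sketch below illustrates, with plain re/json and a made-up page snippet, what the new contains_pattern argument of InfoExtractor._search_json gives the extractor -- the pattern is spliced between the literal braces of the JSON match, so only the ld+json object whose body starts with "@type": "AudioObject" is captured. The sample HTML, the explicit <script> start pattern and the re.DOTALL flag are assumptions of this illustration; the extractor in the patch passes an empty start_pattern and lets contains_pattern alone locate the block, and the real helper additionally parses with ignore_extra=True.

# Illustration only (not from the patch): standalone equivalent of the regex
# built by the patched InfoExtractor._search_json:
#     rf'{start_pattern}\s*(?P<json>{{{contains_pattern}}})\s*{end_pattern}'
import json
import re

webpage = '''
<script type="application/ld+json">{"@type": "WebPage", "name": "not the audio"}</script>
<script type="application/ld+json">{
    "@type": "AudioObject",
    "contentUrl": "https://media.example.invalid/episode.mp3",
    "encodingFormat": "mp3",
    "duration": "PT45M50S"
}</script>
'''

# Hypothetical start pattern for this demo; the extractor itself passes ''.
start_pattern = r'<script[^>]+type="application/ld\+json"[^>]*>'
contains_pattern = r'\s*"@type"\s*:\s*"AudioObject"\s*.+'

# contains_pattern sits between the literal braces, so the WebPage block above
# cannot match and only the AudioObject object is captured.
# re.DOTALL is only needed because this sample JSON is pretty-printed.
pattern = rf'{start_pattern}\s*(?P<json>{{{contains_pattern}}})\s*'
video_data = json.loads(re.search(pattern, webpage, re.DOTALL).group('json'))

print(video_data['contentUrl'])  # https://media.example.invalid/episode.mp3
print(video_data['duration'])    # PT45M50S (the extractor feeds this to parse_duration)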