import re

from .common import InfoExtractor
from ..utils import parse_duration, unified_strdate


class RadioFranceIE(InfoExtractor):
    """Extractor for "radiovisions" pages on maison.radiofrance.fr."""

    # The named group 'id' is read back in _real_extract via m.group('id').
    _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
    IE_NAME = 'radiofrance'

    _TEST = {
        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
        'info_dict': {
            'id': 'one-one',
            'ext': 'ogg',
            'title': 'One to one',
            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
            'uploader': 'Thomas Hercouët',
        },
    }

    def _real_extract(self, url):
        """Download the page at *url* and build the info dict for it.

        The title and the audio source list are mandatory; description
        and uploader are looked up with ``fatal=False``.
        """
        m = self._match_valid_url(url)
        video_id = m.group('id')

        webpage = self._download_webpage(url, video_id)

        # NOTE(review): the tag/entity markup around the capture groups in
        # the three patterns below was reconstructed -- confirm against the
        # live page markup.
        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
        description = self._html_search_regex(
            r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
            webpage, 'description', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
            webpage, 'uploader', fatal=False)

        # The player's data-source attribute holds "name: 'url'" pairs;
        # each pair becomes one audio-only format.
        formats_str = self._html_search_regex(
            r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
            webpage, 'audio URLs')
        formats = [
            {
                'format_id': fm[0],
                'url': fm[1],
                'vcodec': 'none',
                # Position in the source list is used as the quality rank.
                'quality': i,
            }
            for i, fm in
            enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
        ]

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
        }


class FranceCultureIE(InfoExtractor):
    """Extractor for podcast episode pages on radiofrance.fr."""

    # 'display_id' is the episode slug and 'id' the trailing number; both
    # groups are read back in _real_extract.
    _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])'

    _TESTS = [
        {
            'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487',
            'info_dict': {
                'id': '8440487',
                'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau',
                'ext': 'mp3',
                'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?',
                'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?',
                'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg',
                'upload_date': '20220514',
                'duration': 2750,
            },
        },
        {
            'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507',
            'only_matching': True,
        }
    ]

    def _real_extract(self, url):
        """Download the episode page at *url* and build its info dict.

        The media URL, format and duration come from the embedded
        ``AudioObject`` JSON; the remaining fields are scraped from the
        page markup.
        """
        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
        webpage = self._download_webpage(url, display_id)

        # _search_json_ld doesn't correctly handle this.
        # See https://github.com/yt-dlp/yt-dlp/pull/3874#discussion_r891903846
        video_data = self._search_json(
            '', webpage, 'audio data', display_id,
            contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}')

        return {
            'id': video_id,
            'display_id': display_id,
            'url': video_data['contentUrl'],
            'ext': video_data.get('encodingFormat'),
            # Only flag the stream as audio-only when the declared format
            # is mp3; otherwise leave the codec unspecified.
            'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None,
            'duration': parse_duration(video_data.get('duration')),
            'title': self._html_search_regex(
                r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>',
                webpage, 'title', default=self._og_search_title(webpage)),
            # NOTE(review): the description pattern, the thumbnail lookup and
            # the opening tag of the uploader pattern were reconstructed --
            # confirm against the live page markup.
            'description': self._html_search_regex(
                r'(?s)<meta name="description"\s*content="([^"]+)',
                webpage, 'description', default=None),
            'thumbnail': self._og_search_thumbnail(webpage),
            'uploader': self._html_search_regex(
                r'(?s)<span class="author">(.*?)</span>',
                webpage, 'uploader', default=None),
            'upload_date': unified_strdate(self._search_regex(
                r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False))
        }