From 4b4b7f746c1a05d1c842639e0bc4ed427fe609e3 Mon Sep 17 00:00:00 2001 From: Benedikt Wildenhain Date: Fri, 24 Dec 2021 23:05:23 +0100 Subject: [PATCH] [OpenCast] Add extractors (#1905) Original PR: https://github.com/ytdl-org/youtube-dl/pull/26934 Authored by: bwildenhain, C0D3D3V --- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/extractor/extractors.py | 4 + yt_dlp/extractor/opencast.py | 177 +++++++++++++++++++++++++++++++++ yt_dlp/utils.py | 2 +- 4 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 yt_dlp/extractor/opencast.py diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 6fcd52b995..4d6152e6e5 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1151,7 +1151,7 @@ class YoutubeDL(object): str_fmt = f'{fmt[:-1]}s' if fmt[-1] == 'l': # list delim = '\n' if '#' in flags else ', ' - value, fmt = delim.join(variadic(value)), str_fmt + value, fmt = delim.join(variadic(value, allowed_types=(str, bytes))), str_fmt elif fmt[-1] == 'j': # json value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt elif fmt[-1] == 'q': # quoted diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 2a907bc572..6798e0f379 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -1040,6 +1040,10 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) +from .opencast import ( + OpencastIE, + OpencastPlaylistIE, +) from .openrec import ( OpenRecIE, OpenRecCaptureIE, diff --git a/yt_dlp/extractor/opencast.py b/yt_dlp/extractor/opencast.py new file mode 100644 index 0000000000..cf8d917176 --- /dev/null +++ b/yt_dlp/extractor/opencast.py @@ -0,0 +1,177 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + parse_iso8601, + traverse_obj, + variadic, +) + + +class OpencastBaseIE(InfoExtractor): + _INSTANCES_RE = r'''(?: + opencast\.informatik\.kit\.edu| + electures\.uni-muenster\.de| + oc-presentation\.ltcc\.tuwien\.ac\.at| + medien\.ph-noe\.ac\.at| + oc-video\.ruhr-uni-bochum\.de| + oc-video1\.ruhr-uni-bochum\.de| + opencast\.informatik\.uni-goettingen\.de| + heicast\.uni-heidelberg\.de| + opencast\.hawk\.de:8080| + opencast\.hs-osnabrueck\.de| + video[0-9]+\.virtuos\.uni-osnabrueck\.de| + opencast\.uni-koeln\.de| + media\.opencast\.hochschule-rhein-waal\.de| + matterhorn\.dce\.harvard\.edu| + hs-harz\.opencast\.uni-halle\.de| + videocampus\.urz\.uni-leipzig\.de| + media\.uct\.ac\.za| + vid\.igb\.illinois\.edu| + cursosabertos\.c3sl\.ufpr\.br| + mcmedia\.missioncollege\.org| + clases\.odon\.edu\.uy + )''' + _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' + + def _call_api(self, host, video_id, **kwargs): + return self._download_json(self._API_BASE % (host, video_id), video_id, **kwargs) + + def _parse_mediapackage(self, video): + video_id = video.get('id') + if video_id is None: + raise ExtractorError('Video id was not found') + + formats = [] + for track in variadic(traverse_obj(video, ('media', 'track')) or []): + href = track.get('url') + if href is None: + continue + ext = determine_ext(href, None) + + transport = track.get('transport') + + if transport == 'DASH' or ext == 'mpd': + formats.extend(self._extract_mpd_formats_and_subtitles(href, video_id, mpd_id='dash', fatal=False)) + elif transport == 'HLS' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats_and_subtitles( + href, video_id, m3u8_id='hls', entry_protocol='m3u8_native', fatal=False)) + elif transport == 'HDS' or ext == 'f4m': + formats.extend(self._extract_f4m_formats(href, video_id, f4m_id='hds', fatal=False)) + elif transport == 'SMOOTH': + formats.extend(self._extract_ism_formats(href, video_id, ism_id='smooth', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats(href, video_id, fatal=False)) + else: + track_obj = { + 'url': href, + 'ext': ext, + 'format_note': track.get('transport'), + 'resolution': traverse_obj(track, ('video', 'resolution')), + 'fps': int_or_none(traverse_obj(track, ('video', 'framerate'))), + 'vbr': int_or_none(traverse_obj(track, ('video', 'bitrate')), scale=1000), + 'vcodec': traverse_obj(track, ('video', 'encoder', 'type')) if track.get('video') else 'none', + 'abr': int_or_none(traverse_obj(track, ('audio', 'bitrate')), scale=1000), + 'asr': int_or_none(traverse_obj(track, ('audio', 'samplingrate'))), + 'acodec': traverse_obj(track, ('audio', 'encoder', 'type')) if track.get('audio') else 'none', + } + + if transport == 'RTMP': + m_obj = re.search(r'(?:rtmp://[^/]+/(?P[^/]+))/(?P.+):(?P.+)', href) + if not m_obj: + continue + track_obj.update({ + 'app': m_obj.group('app'), + 'ext': m_obj.group('ext'), + 'play_path': m_obj.group('ext') + ':' + m_obj.group('playpath'), + 'rtmp_live': True, + 'preference': -2, + }) + formats.append(track_obj) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': video.get('title'), + 'series': video.get('seriestitle'), + 'season_id': video.get('series'), + 'creator': traverse_obj(video, ('creators', 'creator')), + 'timestamp': parse_iso8601(video.get('start')), + 'thumbnail': traverse_obj(video, ('attachments', 'attachment', ..., 'url'), get_all=False), + } + + +class OpencastIE(OpencastBaseIE): + _VALID_URL = r'''(?x) + https?://(?P%s)/paella/ui/watch.html\?.*? + id=(?P%s) + ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + + _API_BASE = 'https://%s/search/episode.json?id=%s' + + _TESTS = [ + { + 'url': 'https://oc-video1.ruhr-uni-bochum.de/paella/ui/watch.html?id=ed063cd5-72c8-46b5-a60a-569243edcea8', + 'md5': '554c8e99a90f7be7e874619fcf2a3bc9', + 'info_dict': { + 'id': 'ed063cd5-72c8-46b5-a60a-569243edcea8', + 'ext': 'mp4', + 'title': '11 - Kryptographie - 24.11.2015', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1606208400, + 'upload_date': '20201124', + }, + } + ] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).group('host', 'id') + return self._parse_mediapackage( + self._call_api(host, video_id)['search-results']['result']['mediapackage']) + + +class OpencastPlaylistIE(OpencastBaseIE): + _VALID_URL = r'''(?x) + https?://(?P%s)/engage/ui/index.html\?.*? + epFrom=(?P%s) + ''' % (OpencastBaseIE._INSTANCES_RE, OpencastBaseIE._UUID_RE) + + _API_BASE = 'https://%s/search/episode.json?sid=%s' + + _TESTS = [ + { + 'url': 'https://oc-video1.ruhr-uni-bochum.de/engage/ui/index.html?epFrom=cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'info_dict': { + 'id': 'cf68a4a1-36b1-4a53-a6ba-61af5705a0d0', + 'title': 'Kryptographie - WiSe 15/16', + }, + 'playlist_mincount': 28, + }, + { + 'url': 'https://oc-video.ruhr-uni-bochum.de/engage/ui/index.html?e=1&p=1&epFrom=b1a54262-3684-403f-9731-8e77c3766f9a', + 'info_dict': { + 'id': 'b1a54262-3684-403f-9731-8e77c3766f9a', + 'title': 'inSTUDIES-Social movements and prefigurative politics in a global perspective', + }, + 'playlist_mincount': 6, + }, + ] + + def _real_extract(self, url): + host, video_id = self._match_valid_url(url).group('host', 'id') + + entries = [ + self._parse_mediapackage(episode['mediapackage']) + for episode in variadic(self._call_api(host, video_id)['search-results']['result']) + if episode.get('mediapackage') + ] + + return self.playlist_result(entries, video_id, traverse_obj(entries, (0, 'series'))) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 29b1b9d450..2c5e6560a4 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -5024,7 +5024,7 @@ def traverse_dict(dictn, keys, casesense=True): return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) -def variadic(x, allowed_types=(str, bytes)): +def variadic(x, allowed_types=(str, bytes, dict)): return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)