[ie/ARD] Overhaul extractors (#8878)

Closes #8731, Closes #6784, Closes #2366, Closes #2975, Closes #8760. Authored by: seproDev
2023-12-30 21:44:32 +01:00 · 2023-12-30 21:44:32 +01:00 · 5f009a094f
parent 225cf2b830
commit 5f009a094f
2 changed files with 250 additions and 375 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -134,8 +134,8 @@ from .arcpublishing import ArcPublishingIE
 from .arkena import ArkenaIE
 from .ard import (
    ARDBetaMediathekIE,
    ARDMediathekCollectionIE,
    ARDIE,
    ARDMediathekIE,
 )
 from .arte import (
    ArteTVIE,
--- a/yt_dlp/extractor/ard.py
+++ b/yt_dlp/extractor/ard.py
@ -1,24 +1,23 @@
 import json
 import re
 from functools import partial
 from .common import InfoExtractor
 from .generic import GenericIE
 from ..utils import (
    OnDemandPagedList,
    determine_ext,
    ExtractorError,
    int_or_none,
    join_nonempty,
    make_archive_id,
    parse_duration,
-    qualities,
+    parse_iso8601,
    remove_start,
    str_or_none,
    try_get,
    unified_strdate,
    unified_timestamp,
    update_url,
    update_url_query,
    url_or_none,
    xpath_text,
 )
-from ..compat import compat_etree_fromstring
+from ..utils.traversal import traverse_obj
 class ARDMediathekBaseIE(InfoExtractor):
@ -61,45 +60,6 @@ class ARDMediathekBaseIE(InfoExtractor):
            'subtitles': subtitles,
        }
    def _ARD_extract_episode_info(self, title):
        """Try to extract season/episode data from the title."""
        res = {}
        if not title:
            return res
        for pattern in [
            # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
            # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
            # E.g.: title="Fritjof aus Norwegen (2) (AD)"
            # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
            # E.g.: title="Folge 25/42: Symmetrie"
            # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
            # E.g.: title="Folge 1063 - Vertrauen"
            # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
        ]:
            m = re.match(pattern, title)
            if m:
                groupdict = m.groupdict()
                res['season_number'] = int_or_none(groupdict.get('season_number'))
                res['episode_number'] = int_or_none(groupdict.get('episode_number'))
                res['episode'] = str_or_none(groupdict.get('episode'))
                # Build the episode title by removing numeric episode information:
                if groupdict.get('ep_info') and not res['episode']:
                    res['episode'] = str_or_none(
                        title.replace(groupdict.get('ep_info'), ''))
                if res['episode']:
                    res['episode'] = res['episode'].strip()
                break
        # As a fallback use the whole title as the episode name:
        if not res.get('episode'):
            res['episode'] = title.strip()
        return res
    def _extract_formats(self, media_info, video_id):
        type_ = media_info.get('_type')
        media_array = media_info.get('_mediaArray', [])
@ -155,138 +115,6 @@ class ARDMediathekBaseIE(InfoExtractor):
        return formats
 class ARDMediathekIE(ARDMediathekBaseIE):
    IE_NAME = 'ARD:mediathek'
    _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
    _TESTS = [{
        # available till 26.07.2022
        'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822',
        'info_dict': {
            'id': '44726822',
            'ext': 'mp4',
            'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?',
            'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5',
            'duration': 1740,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
        'only_matching': True,
    }, {
        'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
        'only_matching': True,
    }, {
        # audio
        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
        'only_matching': True,
    }, {
        'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
        'only_matching': True,
    }]
    @classmethod
    def suitable(cls, url):
        """Tell whether this extractor should handle ``url``.

        URLs that ARDBetaMediathekIE accepts are always declined here; anything
        else is decided by the inherited check.
        """
        if ARDBetaMediathekIE.suitable(url):
            return False
        return super(ARDMediathekIE, cls).suitable(url)
    def _real_extract(self, url):
        """Extract a video/audio item from a classic ARD Mediathek page.

        The page is downloaded and checked for known error banners, then title,
        description and thumbnail are scraped from it. Formats come either from
        ``mediaCollection.addMediaStream(...)`` calls embedded in the page or,
        when there are none, from ``self._extract_media_info`` on the
        ``/play/media/<id>`` endpoint.

        Raises ExtractorError (expected) when the page shows one of the known
        "unavailable" messages.
        """
        # determine video id from url: a documentId query parameter wins over
        # the ID captured by _VALID_URL
        m = self._match_valid_url(url)
        document_id = None
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            document_id = video_id = numid.group(1)
        else:
            video_id = m.group('video_id')
        webpage = self._download_webpage(url, video_id)
        # (marker text in the page, error message template) pairs
        ERRORS = (
            ('>Leider liegt eine Störung vor.', 'Video %s is unavailable'),
            ('>Der gewünschte Beitrag ist nicht mehr verfügbar.<',
             'Video %s is no longer available'),
        )
        for pattern, message in ERRORS:
            if pattern in webpage:
                raise ExtractorError(message % video_id, expected=True)
        # URLs carrying an "rss" query parameter may be feeds; if the document
        # root really is <rss>, hand it over to the generic RSS handling
        if re.search(r'[\?&]rss($|[=&])', url):
            doc = compat_etree_fromstring(webpage.encode('utf-8'))
            if doc.tag == 'rss':
                return GenericIE()._extract_rss(url, video_id, doc)
        # Title: OpenGraph first, then a list of HTML patterns
        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
             r'<meta name="dcterms\.title" content="(.*?)"/>',
             r'<h4 class="headline">(.*?)</h4>',
             r'<title[^>]*>(.*?)</title>'],
            webpage, 'title')
        # Description: OpenGraph, dcterms.abstract, meta description, then the
        # teaser paragraph, each tried only if the previous ones gave nothing
        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
            'dcterms.abstract', webpage, 'description', default=None)
        if description is None:
            description = self._html_search_meta(
                'description', webpage, 'meta description', default=None)
        if description is None:
            description = self._html_search_regex(
                r'<p\s+class="teasertext">(.+?)</p>',
                webpage, 'teaser text', default=None)
        # Thumbnail is sometimes not present.
        # It is in the mobile version, but that seems to use a different URL
        # structure altogether.
        thumbnail = self._og_search_thumbnail(webpage, default=None)
        # Stream URLs registered inline through mediaCollection.addMediaStream()
        media_streams = re.findall(r'''(?x)
            mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
            "([^"]+)"''', webpage)
        if media_streams:
            QUALITIES = qualities(['lo', 'hi', 'hq'])
            formats = []
            # set() drops duplicate URLs
            for furl in set(media_streams):
                if furl.endswith('.f4m'):
                    fid = 'f4m'
                else:
                    # format id is the second-to-last dot-separated part of the URL
                    fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
                    fid = fid_m.group(1) if fid_m else None
                formats.append({
                    'quality': QUALITIES(fid),
                    'format_id': fid,
                    'url': furl,
                })
            info = {
                'formats': formats,
            }
        else:  # request JSON file
            if not document_id:
                # NOTE(review): presumably video_id becomes None here when neither
                # pattern matches, and the request below would then be built with
                # the literal 'None' - confirm
                video_id = self._search_regex(
                    (r'/play/(?:config|media|sola)/(\d+)', r'contentId["\']\s*:\s*(\d+)'),
                    webpage, 'media id', default=None)
            info = self._extract_media_info(
                'http://www.ardmediathek.de/play/media/%s' % video_id,
                webpage, video_id)
        info.update({
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        })
        # Add season/episode fields parsed from the title
        info.update(self._ARD_extract_episode_info(info['title']))
        return info
 class ARDIE(InfoExtractor):
    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
    _TESTS = [{
@ -399,21 +227,23 @@ class ARDIE(InfoExtractor):
        }
-class ARDBetaMediathekIE(ARDMediathekBaseIE):
+class ARDBetaMediathekIE(InfoExtractor):
    IE_NAME = 'ARDMediathek'
    _VALID_URL = r'''(?x)https://
        (?:(?:beta|www)\.)?ardmediathek\.de/
-        (?:(?P<client>[^/]+)/)?
+        (?:[^/]+/)?
-        (?:player|live|video|(?P<playlist>sendung|serie|sammlung))/
+        (?:player|live|video)/
-        (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)?
+        (?:(?P<display_id>[^?#]+)/)?
-        (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)
+        (?P<id>[a-zA-Z0-9]+)
-        (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''
+        /?(?:[?#]|$)'''
    _GEO_COUNTRIES = ['DE']
    _TESTS = [{
        'url': 'https://www.ardmediathek.de/video/filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
        'md5': 'b6e8ab03f2bcc6e1f9e6cef25fcc03c4',
        'info_dict': {
            'display_id': 'filme-im-mdr/liebe-auf-vier-pfoten/mdr-fernsehen',
-            'id': '12939099',
+            'id': 'Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MC80MjIwOTEtNDAyNTM0',
            'title': 'Liebe auf vier Pfoten',
            'description': r're:^Claudia Schmitt, Anwältin in Salzburg',
            'duration': 5222,
@ -422,7 +252,10 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
            'upload_date': '20231130',
            'ext': 'mp4',
            'episode': 'Liebe auf vier Pfoten',
-            'series': 'Filme im MDR'
+            'series': 'Filme im MDR',
            'age_limit': 0,
            'channel': 'MDR',
            '_old_archive_ids': ['ardbetamediathek 12939099'],
        },
    }, {
        'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
@ -443,7 +276,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
        'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
        'md5': '1e73ded21cb79bac065117e80c81dc88',
        'info_dict': {
-            'id': '10049223',
+            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll',
            'ext': 'mp4',
            'title': 'tagesschau, 20:00 Uhr',
            'timestamp': 1636398000,
@ -454,6 +287,26 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
            'episode': 'tagesschau, 20:00 Uhr',
            'series': 'tagesschau',
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:fbb21142783b0a49?w=960&ch=ee69108ae344f678',
            'channel': 'ARD-Aktuell',
            '_old_archive_ids': ['ardbetamediathek 10049223'],
        },
    }, {
        'url': 'https://www.ardmediathek.de/video/7-tage/7-tage-unter-harten-jungs/hr-fernsehen/N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
        'md5': 'c428b9effff18ff624d4f903bda26315',
        'info_dict': {
            'id': 'N2I2YmM5MzgtNWFlOS00ZGFlLTg2NzMtYzNjM2JlNjk4MDg3',
            'ext': 'mp4',
            'duration': 2700,
            'episode': '7 Tage ... unter harten Jungs',
            'description': 'md5:0f215470dcd2b02f59f4bd10c963f072',
            'upload_date': '20231005',
            'timestamp': 1696491171,
            'display_id': '7-tage/7-tage-unter-harten-jungs/hr-fernsehen',
            'series': '7 Tage ...',
            'channel': 'HR',
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:f6e6d5ffac41925c?w=960&ch=fa32ba69bc87989a',
            'title': '7 Tage ... unter harten Jungs',
            '_old_archive_ids': ['ardbetamediathek 94834686'],
        },
    }, {
        'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
@ -470,6 +323,176 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
    }, {
        'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
        'only_matching': True,
    }]
    def _extract_episode_info(self, title):
        patterns = [
            # Pattern for title like "Homo sapiens (S06/E07) - Originalversion"
            # from: https://www.ardmediathek.de/one/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw
            r'.*(?P<ep_info> \(S(?P<season_number>\d+)/E(?P<episode_number>\d+)\)).*',
            # E.g.: title="Fritjof aus Norwegen (2) (AD)"
            # from: https://www.ardmediathek.de/ard/sammlung/der-krieg-und-ich/68cMkqJdllm639Skj4c7sS/
            r'.*(?P<ep_info> \((?:Folge |Teil )?(?P<episode_number>\d+)(?:/\d+)?\)).*',
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:\:| -|) )\"(?P<episode>.+)\".*',
            # E.g.: title="Folge 25/42: Symmetrie"
            # from: https://www.ardmediathek.de/ard/video/grips-mathe/folge-25-42-symmetrie/ard-alpha/Y3JpZDovL2JyLmRlL3ZpZGVvLzMyYzI0ZjczLWQ1N2MtNDAxNC05ZmZhLTFjYzRkZDA5NDU5OQ/
            # E.g.: title="Folge 1063 - Vertrauen"
            # from: https://www.ardmediathek.de/ard/sendung/die-fallers/Y3JpZDovL3N3ci5kZS8yMzAyMDQ4/
            r'.*(?P<ep_info>Folge (?P<episode_number>\d+)(?:/\d+)?(?:\:| -|) ).*',
            # As a fallback use the full title
            r'(?P<title>.*)',
        ]
        return traverse_obj(patterns, (..., {partial(re.match, string=title)}, {
            'season_number': ('season_number', {int_or_none}),
            'episode_number': ('episode_number', {int_or_none}),
            'episode': ((
                ('episode', {str_or_none}),
                ('ep_info', {lambda x: title.replace(x, '')}),
                ('title', {str}),
            ), {str.strip}),
        }), get_all=False)
    def _real_extract(self, url):
        """Extract one Mediathek item (on-demand video or livestream).

        Fetches the page-gateway JSON for the ID matched from ``url``, takes the
        first 'player_ondemand'/'player_live' widget and collects formats and
        subtitles from its embedded media collection.

        :param url: URL matching ``_VALID_URL``
        :return: info dict; 'formats' stays empty when the page has no player
            widget or no usable streams
        """
        video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
        page_data = self._download_json(
            f'https://api.ardmediathek.de/page-gateway/pages/ard/item/{video_id}', video_id, query={
                'embedded': 'false',
                'mcV6': 'true',
            })
        # Pages without a player widget yield no match; fall back to an empty
        # dict so the lookups below do not fail on None
        player_data = traverse_obj(
            page_data, ('widgets', lambda _, v: v['type'] in ('player_ondemand', 'player_live'), {dict}),
            get_all=False) or {}
        is_live = player_data.get('type') == 'player_live'
        media_data = traverse_obj(player_data, ('mediaCollection', 'embedded', {dict}))
        if player_data.get('blockedByFsk'):
            self.raise_no_formats('This video is only available after 22:00', expected=True)
        formats = []
        subtitles = {}
        for stream in traverse_obj(media_data, ('streams', ..., {dict})):
            kind = stream.get('kind')
            # Prioritize main stream over sign language and others
            preference = 1 if kind == 'main' else None
            for media in traverse_obj(stream, ('media', lambda _, v: url_or_none(v['url']))):
                media_url = media['url']
                # The audio kind 'standard' is dropped so that it adds nothing
                # to the language tag
                audio_kind = traverse_obj(media, (
                    'audios', 0, 'kind', {str}), default='').replace('standard', '')
                lang_code = traverse_obj(media, ('audios', 0, 'languageCode', {str})) or 'deu'
                lang = join_nonempty(lang_code, audio_kind)
                # Only streams tagged exactly 'deu' are preferred
                language_preference = 10 if lang == 'deu' else -10
                if determine_ext(media_url) == 'm3u8':
                    fmts, subs = self._extract_m3u8_formats_and_subtitles(
                        media_url, video_id, m3u8_id=f'hls-{kind}', preference=preference, fatal=False, live=is_live)
                    for f in fmts:
                        f['language'] = lang
                        f['language_preference'] = language_preference
                    formats.extend(fmts)
                    self._merge_subtitles(subs, target=subtitles)
                else:
                    formats.append({
                        'url': media_url,
                        'format_id': f'http-{kind}',
                        'preference': preference,
                        'language': lang,
                        'language_preference': language_preference,
                        **traverse_obj(media, {
                            'format_note': ('forcedLabel', {str}),
                            'width': ('maxHResolutionPx', {int_or_none}),
                            'height': ('maxVResolutionPx', {int_or_none}),
                            'vcodec': ('videoCodec', {str}),
                        }),
                    })
        for sub in traverse_obj(media_data, ('subtitles', ..., {dict})):
            for sources in traverse_obj(sub, ('sources', lambda _, v: url_or_none(v['url']))):
                subtitles.setdefault(sub.get('languageCode') or 'deu', []).append({
                    'url': sources['url'],
                    'ext': {'webvtt': 'vtt', 'ebutt': 'ttml'}.get(sources.get('kind')),
                })
        age_limit = traverse_obj(page_data, ('fskRating', {lambda x: remove_start(x, 'FSK')}, {int_or_none}))
        old_id = traverse_obj(page_data, ('tracking', 'atiCustomVars', 'contentId'))
        return {
            'id': video_id,
            'display_id': display_id,
            'formats': formats,
            'subtitles': subtitles,
            'is_live': is_live,
            'age_limit': age_limit,
            **traverse_obj(media_data, ('meta', {
                'title': 'title',
                'description': 'synopsis',
                'timestamp': ('broadcastedOnDateTime', {parse_iso8601}),
                'series': 'seriesTitle',
                'thumbnail': ('images', 0, 'url', {url_or_none}),
                'duration': ('durationSeconds', {int_or_none}),
                'channel': 'clipSourceName',
            })),
            # ``or {}`` keeps the unpacking safe when no episode info is returned
            **(self._extract_episode_info(page_data.get('title')) or {}),
            # Build the legacy archive ID only when the old numeric content ID
            # is present; otherwise it would be the bogus 'ardbetamediathek None'
            '_old_archive_ids': [make_archive_id(ARDBetaMediathekIE, old_id)] if old_id else None,
        }
 class ARDMediathekCollectionIE(InfoExtractor):
    _VALID_URL = r'''(?x)https://
        (?:(?:beta|www)\.)?ardmediathek\.de/
        (?:[^/?#]+/)?
        (?P<playlist>sendung|serie|sammlung)/
        (?:(?P<display_id>[^?#]+?)/)?
        (?P<id>[a-zA-Z0-9]+)
        (?:/(?P<season>\d+)(?:/(?P<version>OV|AD))?)?/?(?:[?#]|$)'''
    _GEO_COUNTRIES = ['DE']
    _TESTS = [{
        'url': 'https://www.ardmediathek.de/serie/quiz/staffel-1-originalversion/Y3JpZDovL3dkci5kZS9vbmUvcXVpeg/1/OV',
        'info_dict': {
            'id': 'Y3JpZDovL3dkci5kZS9vbmUvcXVpeg_1_OV',
            'display_id': 'quiz/staffel-1-originalversion',
            'title': 'Staffel 1 Originalversion',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-4-mit-audiodeskription/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/4/AD',
        'info_dict': {
            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_4_AD',
            'display_id': 'babylon-berlin/staffel-4-mit-audiodeskription',
            'title': 'Staffel 4 mit Audiodeskription',
        },
        'playlist_count': 12,
    }, {
        'url': 'https://www.ardmediathek.de/serie/babylon-berlin/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu/1/',
        'info_dict': {
            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JhYnlsb24tYmVybGlu_1',
            'display_id': 'babylon-berlin/staffel-1',
            'title': 'Staffel 1',
        },
        'playlist_count': 8,
    }, {
        'url': 'https://www.ardmediathek.de/sendung/tatort/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
        'info_dict': {
            'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydA',
            'display_id': 'tatort',
            'title': 'Tatort',
        },
        'playlist_mincount': 500,
    }, {
        'url': 'https://www.ardmediathek.de/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2',
        'info_dict': {
            'id': '5eOHzt8XB2sqeFXbIoJlg2',
            'display_id': 'die-kirche-bleibt-im-dorf',
            'title': 'Die Kirche bleibt im Dorf',
            'description': 'Die Kirche bleibt im Dorf',
        },
        'playlist_count': 4,
    }, {
        # playlist of type 'sendung'
        'url': 'https://www.ardmediathek.de/ard/sendung/doctor-who/Y3JpZDovL3dkci5kZS9vbmUvZG9jdG9yIHdobw/',
@ -482,196 +505,48 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
        # playlist of type 'sammlung'
        'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
        'only_matching': True,
    }, {
        'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
        'only_matching': True,
    }]
-    def _ARD_load_playlist_snippet(self, playlist_id, display_id, client, mode, page_number):
+    _PAGE_SIZE = 100
        """ Query the ARD server for playlist information
        and returns the data in "raw" format """
        assert mode in ('sendung', 'serie', 'sammlung')
        if mode in ('sendung', 'serie'):
            graphQL = json.dumps({
                'query': '''{
                    showPage(
                        client: "%s"
                        showId: "%s"
                        pageNumber: %d
                    ) {
                        pagination {
                            pageSize
                            totalElements
                        }
                        teasers {        # Array
                            mediumTitle
                            links { target { id href title } }
                            type
                        }
                    }}''' % (client, playlist_id, page_number),
            }).encode()
        else:  # mode == 'sammlung'
            graphQL = json.dumps({
                'query': '''{
                    morePage(
                        client: "%s"
                        compilationId: "%s"
                        pageNumber: %d
                    ) {
                        widget {
                            pagination {
                                pageSize
                                totalElements
                            }
                            teasers {        # Array
                                mediumTitle
                                links { target { id href title } }
                                type
                            }
                        }
                    }}''' % (client, playlist_id, page_number),
            }).encode()
        # Ressources for ARD graphQL debugging:
        # https://api-test.ardmediathek.de/public-gateway
        show_page = self._download_json(
            'https://api.ardmediathek.de/public-gateway',
            '[Playlist] %s' % display_id,
            data=graphQL,
            headers={'Content-Type': 'application/json'})['data']
        # align the structure of the returned data:
        if mode in ('sendung', 'serie'):
            show_page = show_page['showPage']
        else:  # mode == 'sammlung'
            show_page = show_page['morePage']['widget']
        return show_page
    def _ARD_extract_playlist(self, url, playlist_id, display_id, client, mode):
        """Build a playlist result from every page of an ARD playlist.

        Pages are requested through ``_ARD_load_playlist_snippet`` until the
        reported 'totalElements' is covered. Teasers whose link contains
        '/compilation/' become 'sammlung' URLs (nested playlists), all others
        become 'video' URLs.
        """
        def slugify(text):
            # Imitate the way ARD turns an episode title into a URL slug
            slug = text.lower()
            for special, plain in (('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('ß', 'ss')):
                slug = slug.replace(special, plain)
            slug = re.sub('[^a-zA-Z0-9]+', '-', slug)  # special chars become '-'
            return re.sub('^-|-$', '', slug)  # trim a leading/trailing '-'

        playlist_entries = []
        page_index = 0
        fetched_all = False
        while not fetched_all:
            page = self._ARD_load_playlist_snippet(
                playlist_id, display_id, client, mode, page_index)
            for teaser in page['teasers']:
                target = teaser['links']['target']
                # A nested compilation (alternative check: teaser['type'] == "compilation"), e.g.:
                # https://www.ardmediathek.de/ard/sammlung/die-kirche-bleibt-im-dorf/5eOHzt8XB2sqeFXbIoJlg2/
                is_nested = '/compilation/' in target['href']
                link_mode = 'sammlung' if is_nested else 'video'
                item_url = 'https://www.ardmediathek.de/%s/%s/%s/%s/%s' % (
                    client, link_mode, display_id, slugify(target['title']), target['id'])
                playlist_entries.append(self.url_result(
                    item_url,
                    ie=ARDBetaMediathekIE.ie_key()))
            pagination = page['pagination']
            page_index += 1
            # Stop once the pages seen so far cover all playlist entries
            fetched_all = pagination['pageSize'] * page_index >= pagination['totalElements']
        return self.playlist_result(playlist_entries, playlist_id, playlist_title=display_id)
    def _real_extract(self, url):
-        video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group(
+        playlist_id, display_id, playlist_type, season_number, version = self._match_valid_url(url).group(
-            'id', 'display_id', 'playlist', 'client', 'season')
+            'id', 'display_id', 'playlist', 'season', 'version')
        display_id, client = display_id or video_id, client or 'ard'
-        if playlist_type:
+        def call_api(page_num):
-            # TODO: Extract only specified season
+            api_path = 'compilations/ard' if playlist_type == 'sammlung' else 'widgets/ard/asset'
-            return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type)
+            return self._download_json(
                f'https://api.ardmediathek.de/page-gateway/{api_path}/{playlist_id}', playlist_id,
                f'Downloading playlist page {page_num}', query={
                    'pageNumber': page_num,
                    'pageSize': self._PAGE_SIZE,
                    **({
                        'seasoned': 'true',
                        'seasonNumber': season_number,
                        'withOriginalversion': 'true' if version == 'OV' else 'false',
                        'withAudiodescription': 'true' if version == 'AD' else 'false',
                    } if season_number else {}),
                })
-        player_page = self._download_json(
+        def fetch_page(page_num):
-            'https://api.ardmediathek.de/public-gateway',
+            for item in traverse_obj(call_api(page_num), ('teasers', ..., {dict})):
-            display_id, data=json.dumps({
+                item_id = traverse_obj(item, ('links', 'target', ('urlId', 'id')), 'id', get_all=False)
-                'query': '''{
+                if not item_id or item_id == playlist_id:
-  playerPage(client:"%s", clipId: "%s") {
+                    continue
-    blockedByFsk
+                item_mode = 'sammlung' if item.get('type') == 'compilation' else 'video'
-    broadcastedOn
+                yield self.url_result(
-    maturityContentRating
+                    f'https://www.ardmediathek.de/{item_mode}/{item_id}',
-    mediaCollection {
+                    ie=(ARDMediathekCollectionIE if item_mode == 'sammlung' else ARDBetaMediathekIE),
-      _duration
+                    **traverse_obj(item, {
-      _geoblocked
+                        'id': ('id', {str}),
-      _isLive
+                        'title': ('longTitle', {str}),
-      _mediaArray {
+                        'duration': ('duration', {int_or_none}),
-        _mediaStreamArray {
+                        'timestamp': ('broadcastedOn', {parse_iso8601}),
-          _quality
+                    }))
          _server
          _stream
        }
      }
      _previewImage
      _subtitleUrl
      _type
    }
    show {
      title
    }
    image {
      src
    }
    synopsis
    title
    tracking {
      atiCustomVars {
        contentId
      }
    }
  }
 }''' % (client, video_id),
            }).encode(), headers={
                'Content-Type': 'application/json'
            })['data']['playerPage']
        title = player_page['title']
        content_id = str_or_none(try_get(
            player_page, lambda x: x['tracking']['atiCustomVars']['contentId']))
        media_collection = player_page.get('mediaCollection') or {}
        if not media_collection and content_id:
            media_collection = self._download_json(
                'https://www.ardmediathek.de/play/media/' + content_id,
                content_id, fatal=False) or {}
        info = self._parse_media_info(
            media_collection, content_id or video_id,
            player_page.get('blockedByFsk'))
        age_limit = None
        description = player_page.get('synopsis')
        maturity_content_rating = player_page.get('maturityContentRating')
        if maturity_content_rating:
            age_limit = int_or_none(maturity_content_rating.lstrip('FSK'))
        if not age_limit and description:
            age_limit = int_or_none(self._search_regex(
                r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
        info.update({
            'age_limit': age_limit,
            'display_id': display_id,
            'title': title,
            'description': description,
            'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
            'series': try_get(player_page, lambda x: x['show']['title']),
            'thumbnail': (media_collection.get('_previewImage')
                          or try_get(player_page, lambda x: update_url(x['image']['src'], query=None, fragment=None))
                          or self.get_thumbnail_from_html(display_id, url)),
        })
        info.update(self._ARD_extract_episode_info(info['title']))
        return info
-    def get_thumbnail_from_html(self, display_id, url):
+        page_data = call_api(0)
-        webpage = self._download_webpage(url, display_id, fatal=False) or ''
+        full_id = join_nonempty(playlist_id, season_number, version, delim='_')
-        return (
+
-            self._og_search_thumbnail(webpage, default=None)
+        return self.playlist_result(
-            or self._html_search_meta('thumbnailUrl', webpage, default=None))
+            OnDemandPagedList(fetch_page, self._PAGE_SIZE), full_id, display_id=display_id,
            title=page_data.get('title'), description=page_data.get('synopsis'))