From c0837a12c8a64c682a01e4bfdee6f22615568d69 Mon Sep 17 00:00:00 2001 From: Boris Wachtmeister Date: Sat, 12 Mar 2016 18:00:26 +0100 Subject: [PATCH 1/8] [WDR] complete overhaul after relaunch of the site The WDR relaunched their site on 2016-02-23 which not only changed the URL-schema completely but also the layout of their pages. Apparently the whole "mediathek" now runs on the wdr-domain, so no separate URL for funkhauseuropa anymore. There seems to be no explicit handling of video-sizes on the page or in the URLs anymore. There seems to be only one size for HTML5, but still several sizes for flash. The extractor adds all to the list of formats. There is no metadata for the HTML5-stream, so that the best flash-stream will always be considered as the "best" format. At least in my tests this seemed to be true anyway. --- youtube_dl/extractor/wdr.py | 251 +++++++++++++++--------------------- 1 file changed, 101 insertions(+), 150 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 31c9043032..f881b7300e 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import itertools import re from .common import InfoExtractor @@ -11,204 +10,156 @@ from ..compat import ( ) from ..utils import ( unified_strdate, - qualities, + ExtractorError, ) class WDRIE(InfoExtractor): - _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?' - _VALID_URL = r'(?Phttps?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P.+?)(?P%s)?\.html' % _PLAYER_REGEX + _PAGE_REGEX = r'/mediathek/(?P[^/]+)/(?P[^/]+)/(?P.+)\.html' + _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + + _JS_URL_REGEX = r'(https?://deviceids-medp.wdr.de/ondemand/\d+/\d+\.js)' _TESTS = [ { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', + 'md5': 'e58c39c3e30077141d258bf588700a7b', 'info_dict': { - 'id': 'mdb-362427', + 'id': 'mdb-1058683', 'ext': 'flv', - 'title': 'Servicezeit', - 'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb', - 'upload_date': '20140310', - 'is_live': False - }, - 'params': { - 'skip_download': True, + 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100', + 'title': 'Geheimnis Aachener Dom', + 'alt_title': 'Doku am Freitag', + 'upload_date': '20160304', + 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318', + 'is_live': False, + 'subtitles': {'de': [{ + 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml' + }]}, }, 'skip': 'Page Not Found', }, { - 'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html', + 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', + 'md5': 'f4c1f96d01cf285240f53ea4309663d8', 'info_dict': { - 'id': 'mdb-363194', + 'id': 'mdb-1072000', + 'ext': 'mp3', + 'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100', + 'title': 'Schriftstellerin Juli Zeh', + 'alt_title': 'WDR 3 Gespräch am Samstag', + 'upload_date': '20160312', + 'description': 'md5:e127d320bc2b1f149be697ce044a3dd7', + 'is_live': False, + 'subtitles': {} + }, + 'skip': 'Page Not Found', + }, + { + 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', + 'info_dict': { + 'id': 'mdb-103364', 'ext': 'flv', - 'title': 'Marga Spiegel ist tot', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20140311', - 'is_live': False - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Page Not Found', - }, - { - 'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html', - 'md5': '83e9e8fefad36f357278759870805898', - 'info_dict': { - 'id': 'mdb-194332', - 'ext': 'mp3', - 'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)', - 'description': 'md5:2309992a6716c347891c045be50992e4', - 'upload_date': '20091129', - 'is_live': False - }, - }, - { - 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html', - 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa', - 'info_dict': { - 'id': 'mdb-478135', - 'ext': 'mp3', - 'title': 'Flavia Coelho: Amar é Amar', - 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', - 'upload_date': '20140717', - 'is_live': False - }, - 'skip': 'Page Not Found', - }, - { - 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', - 'playlist_mincount': 146, - 'info_dict': { - 'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100', + 'display_id': 'index', + 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'alt_title': 'WDR Fernsehen Live', + 'upload_date': None, + 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', + 'is_live': True, + 'subtitles': {} } }, { - 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html', + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', + 'playlist_mincount': 10, 'info_dict': { - 'id': 'mdb-103364', - 'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', - 'ext': 'flv', - 'upload_date': '20150101', - 'is_live': True - }, - 'params': { - 'skip_download': True, + 'id': 'aktuelle-stunde/aktuelle-stunde-120', }, } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - page_url = mobj.group('url') - page_id = mobj.group('id') + url_type = mobj.group('type') + page_url = mobj.group('page_url') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) - webpage = self._download_webpage(url, page_id) + js_url = self._search_regex(self._JS_URL_REGEX, webpage, 'js_url', default=None) - if mobj.group('player') is None: + if not js_url: entries = [ - self.url_result(page_url + href, 'WDR') + self.url_result(page_url + href[0], 'WDR') for href in re.findall( - r'\s*]*>\s*\s*]+href="([^"]+)"', - webpage, 'm3u8 url', default=None) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, page_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - direct_urls = re.findall( - r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage) - if direct_urls: - for quality, video_url in direct_urls: - formats.append({ - 'url': video_url, - 'preference': preference(quality), - 'http_headers': { - 'User-Agent': 'mobile', - }, - }) - self._sort_formats(formats) - description = self._html_search_meta('Description', webpage, 'description') - return { - 'id': page_id, - 'formats': formats, + 'id': metadata_tracker_data.get("trackerClipId", display_id), + 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'alt_title': metadata_tracker_data.get("trackerClipSubcategory"), + 'formats': formats, 'upload_date': upload_date, - 'is_live': is_live + 'description': self._html_search_meta("Description", webpage), + 'is_live': is_live, + 'subtitles': subtitles, } From 14f7a2b8af17d1f490c46a0a9028ba9d97cf7df2 Mon Sep 17 00:00:00 2001 From: Boris Wachtmeister Date: Sat, 12 Mar 2016 20:14:46 +0100 Subject: [PATCH 2/8] [WDRMaus] switch current show to new WDR extractor (fixes #8562) It seems that the "current show" already uses the new WDR video-player, while all the others videos still use the old player. I just added the current show URL to the normal WDR-extractor, which works fine. This commit needs my changes from PR #8842 that fix the support for WDR. --- youtube_dl/extractor/wdr.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index f881b7300e..ec81f1a28a 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -15,8 +15,9 @@ from ..utils import ( class WDRIE(InfoExtractor): + _CURRENT_MAUS_URL = r'https?://www.wdrmaus.de/aktuelle-sendung/(wdr|index).php5' _PAGE_REGEX = r'/mediathek/(?P[^/]+)/(?P[^/]+)/(?P.+)\.html' - _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + "|" + _CURRENT_MAUS_URL _JS_URL_REGEX = r'(https?://deviceids-medp.wdr.de/ondemand/\d+/\d+\.js)' @@ -75,7 +76,18 @@ class WDRIE(InfoExtractor): 'info_dict': { 'id': 'aktuelle-stunde/aktuelle-stunde-120', }, - } + }, + { + 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', + 'info_dict': { + 'id': 'mdb-1096487', + 'ext': 'flv', + 'upload_date': 're:^[0-9]{8}$', + 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', + 'description': '- Die Sendung mit der Maus -', + }, + 'skip': 'The id changes from week to week because of the new episode' + }, ] def _real_extract(self, url): @@ -195,26 +207,17 @@ class WDRMobileIE(InfoExtractor): class WDRMausIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P[^/?#]+)(?:/index\.php5|(?[^/?#]+)((? Date: Thu, 26 May 2016 16:45:14 +0200 Subject: [PATCH 3/8] [WDR] use single quotes for strings --- youtube_dl/extractor/wdr.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index ec81f1a28a..05bfe7deb3 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -17,7 +17,7 @@ from ..utils import ( class WDRIE(InfoExtractor): _CURRENT_MAUS_URL = r'https?://www.wdrmaus.de/aktuelle-sendung/(wdr|index).php5' _PAGE_REGEX = r'/mediathek/(?P[^/]+)/(?P[^/]+)/(?P.+)\.html' - _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + "|" + _CURRENT_MAUS_URL + _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _JS_URL_REGEX = r'(https?://deviceids-medp.wdr.de/ondemand/\d+/\d+\.js)' @@ -116,23 +116,23 @@ class WDRIE(InfoExtractor): json_data = self._search_regex(r'\(({.*})\)', js_data, 'json') metadata = self._parse_json(json_data, display_id) - metadata_tracker_data = metadata["trackerData"] - metadata_media_resource = metadata["mediaResource"] + metadata_tracker_data = metadata['trackerData'] + metadata_media_resource = metadata['mediaResource'] formats = [] # check if the metadata contains a direct URL to a file - metadata_media_alt = metadata_media_resource.get("alt") + metadata_media_alt = metadata_media_resource.get('alt') if metadata_media_alt: - for tag_name in ["videoURL", 'audioURL']: + for tag_name in ['videoURL', 'audioURL']: if tag_name in metadata_media_alt: formats.append({ 'url': metadata_media_alt[tag_name] }) # check if there are flash-streams for this video - if "dflt" in metadata_media_resource and "videoURL" in metadata_media_resource["dflt"]: - video_url = metadata_media_resource["dflt"]["videoURL"] + if 'dflt' in metadata_media_resource and 'videoURL' in metadata_media_resource['dflt']: + video_url = metadata_media_resource['dflt']['videoURL'] if video_url.endswith('.f4m'): full_video_url = video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18' formats.extend(self._extract_f4m_formats(full_video_url, display_id, f4m_id='hds', fatal=False)) @@ -140,13 +140,13 @@ class WDRIE(InfoExtractor): formats.extend(self._extract_smil_formats(video_url, 'stream', fatal=False)) subtitles = {} - caption_url = metadata_media_resource.get("captionURL") + caption_url = metadata_media_resource.get('captionURL') if caption_url: subtitles['de'] = [{ 'url': caption_url }] - title = metadata_tracker_data.get("trackerClipTitle") + title = metadata_tracker_data.get('trackerClipTitle') is_live = url_type == 'live' if is_live: @@ -163,13 +163,13 @@ class WDRIE(InfoExtractor): self._sort_formats(formats) return { - 'id': metadata_tracker_data.get("trackerClipId", display_id), + 'id': metadata_tracker_data.get('trackerClipId', display_id), 'display_id': display_id, 'title': title, - 'alt_title': metadata_tracker_data.get("trackerClipSubcategory"), + 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), 'formats': formats, 'upload_date': upload_date, - 'description': self._html_search_meta("Description", webpage), + 'description': self._html_search_meta('Description', webpage), 'is_live': is_live, 'subtitles': subtitles, } From 37f972954da0d0f1f0c5e97da8357c4baf687ee6 Mon Sep 17 00:00:00 2001 From: Boris Wachtmeister Date: Thu, 26 May 2016 16:59:45 +0200 Subject: [PATCH 4/8] [WDR] use _download_json with a strip_jsonp --- youtube_dl/extractor/wdr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 05bfe7deb3..73a343c69b 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -9,6 +9,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + strip_jsonp, unified_strdate, ExtractorError, ) @@ -112,9 +113,8 @@ class WDRIE(InfoExtractor): raise ExtractorError('No downloadable streams found', expected=True) - js_data = self._download_webpage(js_url, 'metadata') - json_data = self._search_regex(r'\(({.*})\)', js_data, 'json') - metadata = self._parse_json(json_data, display_id) + metadata = self._download_json( + js_url, 'metadata', transform_source=strip_jsonp) metadata_tracker_data = metadata['trackerData'] metadata_media_resource = metadata['mediaResource'] From bec2c14f2cf4f06f1b99e04d59779d8d103d726a Mon Sep 17 00:00:00 2001 From: Boris Wachtmeister Date: Thu, 26 May 2016 17:30:38 +0200 Subject: [PATCH 5/8] [WDR] add special handling if alt-url is a m3u8 --- youtube_dl/extractor/wdr.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 73a343c69b..fddcbf1907 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -9,6 +9,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + determine_ext, strip_jsonp, unified_strdate, ExtractorError, @@ -61,7 +62,7 @@ class WDRIE(InfoExtractor): 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { 'id': 'mdb-103364', - 'ext': 'flv', + 'ext': 'mp4', 'display_id': 'index', 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'alt_title': 'WDR Fernsehen Live', @@ -69,7 +70,10 @@ class WDRIE(InfoExtractor): 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', 'is_live': True, 'subtitles': {} - } + }, + 'params': { + 'skip_download': True, # m3u8 download + }, }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', @@ -126,9 +130,16 @@ class WDRIE(InfoExtractor): if metadata_media_alt: for tag_name in ['videoURL', 'audioURL']: if tag_name in metadata_media_alt: - formats.append({ - 'url': metadata_media_alt[tag_name] - }) + alt_url = metadata_media_alt[tag_name] + if determine_ext(alt_url) == 'm3u8': + m3u_fmt = self._extract_m3u8_formats( + alt_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls') + formats.extend(m3u_fmt) + else: + formats.append({ + 'url': alt_url + }) # check if there are flash-streams for this video if 'dflt' in metadata_media_resource and 'videoURL' in metadata_media_resource['dflt']: From 33a1ff7113d9dd656b3c56cb404de85646caa559 Mon Sep 17 00:00:00 2001 From: Boris Wachtmeister Date: Thu, 26 May 2016 19:08:12 +0200 Subject: [PATCH 6/8] [WDR] extract jsonp-url by parsing data-extension of mediaLink --- youtube_dl/extractor/wdr.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index fddcbf1907..dd107ef8af 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -10,6 +10,7 @@ from ..compat import ( ) from ..utils import ( determine_ext, + js_to_json, strip_jsonp, unified_strdate, ExtractorError, @@ -21,8 +22,6 @@ class WDRIE(InfoExtractor): _PAGE_REGEX = r'/mediathek/(?P[^/]+)/(?P[^/]+)/(?P.+)\.html' _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL - _JS_URL_REGEX = r'(https?://deviceids-medp.wdr.de/ondemand/\d+/\d+\.js)' - _TESTS = [ { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', @@ -102,9 +101,13 @@ class WDRIE(InfoExtractor): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - js_url = self._search_regex(self._JS_URL_REGEX, webpage, 'js_url', default=None) + # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdrmaus its in a link to the page in a multiline "videoLink"-tag + json_metadata = self._html_search_regex( + r'class=(?:"mediaLink\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', + webpage, 'media link', default=None, flags=re.MULTILINE) - if not js_url: + if not json_metadata: entries = [ self.url_result(page_url + href[0], 'WDR') for href in re.findall( @@ -117,8 +120,12 @@ class WDRIE(InfoExtractor): raise ExtractorError('No downloadable streams found', expected=True) + media_link_obj = self._parse_json(json_metadata, display_id, + transform_source=js_to_json) + jsonp_url = media_link_obj['mediaObj']['url'] + metadata = self._download_json( - js_url, 'metadata', transform_source=strip_jsonp) + jsonp_url, 'metadata', transform_source=strip_jsonp) metadata_tracker_data = metadata['trackerData'] metadata_media_resource = metadata['mediaResource'] From 949fc42e009aed5414caad280d0dc551ffcd9c14 Mon Sep 17 00:00:00 2001 From: Boris Wachtmeister Date: Thu, 26 May 2016 19:58:55 +0200 Subject: [PATCH 7/8] [WDR] the other wdrmaus.de pages also changed to the new player --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/wdr.py | 89 +++++------------------------- 2 files changed, 15 insertions(+), 75 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6de3438fc8..023598130e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -923,7 +923,6 @@ from .wat import WatIE from .wdr import ( WDRIE, WDRMobileIE, - WDRMausIE, ) from .webofstories import ( WebOfStoriesIE, diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index dd107ef8af..1af1e996d8 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -4,10 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) from ..utils import ( determine_ext, js_to_json, @@ -18,7 +14,7 @@ from ..utils import ( class WDRIE(InfoExtractor): - _CURRENT_MAUS_URL = r'https?://www.wdrmaus.de/aktuelle-sendung/(wdr|index).php5' + _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' _PAGE_REGEX = r'/mediathek/(?P[^/]+)/(?P[^/]+)/(?P.+)\.html' _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL @@ -92,6 +88,20 @@ class WDRIE(InfoExtractor): }, 'skip': 'The id changes from week to week because of the new episode' }, + { + 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5', + 'md5': 'ca365705551e4bd5217490f3b0591290', + 'info_dict': { + 'id': 'mdb-186083', + 'ext': 'flv', + 'upload_date': '20130919', + 'title': 'Sachgeschichte - Achterbahn ', + 'description': '- Die Sendung mit der Maus -', + }, + 'params': { + 'skip_download': True, # the file has different versions :( + }, + }, ] def _real_extract(self, url): @@ -222,72 +232,3 @@ class WDRMobileIE(InfoExtractor): 'User-Agent': 'mobile', }, } - - -class WDRMausIE(InfoExtractor): - _VALID_URL = 'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P[^/?#]+)((?

Sendedatum:\s*([0-9\.]+)

', - webpage, 'air date') - title_str = self._html_search_regex( - r'

(.*?)

', webpage, 'title') - title = '%s - %s' % (title_date, title_str) - upload_date = unified_strdate( - self._html_search_meta('dc.date', webpage)) - - fields = compat_parse_qs(param_code) - video_url = fields['firstVideo'][0] - thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0]) - - formats = [{ - 'format_id': 'rtmp', - 'url': video_url, - }] - - jscode = self._download_webpage( - 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js', - video_id, fatal=False, - note='Downloading URL translation table', - errnote='Could not download URL translation table') - if jscode: - for m in re.finditer( - r"stream:\s*'dslSrc=(?P[^']+)',\s*download:\s*'(?P
[^']+)'\s*\}", - jscode): - if video_url.startswith(m.group('stream')): - http_url = video_url.replace( - m.group('stream'), m.group('dl')) - formats.append({ - 'format_id': 'http', - 'url': http_url, - }) - break - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - } From 3a686853e1739dfc26548cdc09fe89e693e76a9f Mon Sep 17 00:00:00 2001 From: Boris Wachtmeister Date: Thu, 26 May 2016 20:16:33 +0200 Subject: [PATCH 8/8] [WDR] fixed parsing of playlists --- youtube_dl/extractor/wdr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 1af1e996d8..1e729cb7cb 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -72,7 +72,7 @@ class WDRIE(InfoExtractor): }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 10, + 'playlist_mincount': 8, 'info_dict': { 'id': 'aktuelle-stunde/aktuelle-stunde-120', }, @@ -121,7 +121,7 @@ class WDRIE(InfoExtractor): entries = [ self.url_result(page_url + href[0], 'WDR') for href in re.findall( - r']+data-extension=' % self._PAGE_REGEX, webpage) ]