From 8068296276657c9d888338c2211c112d69de6fc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 15 Apr 2017 21:50:15 +0700 Subject: [PATCH] [streamango] Improve extraction (closes #12643) --- youtube_dl/extractor/streamango.py | 50 ++++++++++++++++++------------ 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py index a4ef06b666..aa4fad162c 100644 --- a/youtube_dl/extractor/streamango.py +++ b/youtube_dl/extractor/streamango.py @@ -1,11 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, +) class StreamangoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P.+?)/(?:.+)' + _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', 'md5': 'e992787515a182f55e38fc97588d802a', @@ -13,7 +20,6 @@ class StreamangoIE(InfoExtractor): 'id': 'clapasobsptpkdfe', 'ext': 'mp4', 'title': '20170315_150006.mp4', - 'url': r're:https://streamango\.com/v/d/clapasobsptpkdfe~[0-9]{10}~(?:[0-9]+\.){3}[0-9]+~.{8}/720', } }, { 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', @@ -21,29 +27,33 @@ class StreamangoIE(InfoExtractor): }] def _real_extract(self, url): - def extract_url(urltype): - return self._search_regex( - r'type\s*:\s*["\']{}["\']\s*,\s*src\s*:\s*["\'](?P.+?)["\'].*'.format(urltype), - webpage, 'video URL', group='url') - video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) - url = 'https:' + extract_url('video/mp4') - dashurl = extract_url(r'application/dash\+xml') - - formats = [{ - 'url': url, - 'ext': 'mp4', - 'width': 1280, - 'height': 720, - 'format_id': 'mp4', - }] - - formats.extend(self._extract_mpd_formats( - dashurl, video_id, mpd_id='dash', fatal=False)) + formats = [] + for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): + video = self._parse_json( + format_, video_id, transform_source=js_to_json, fatal=False) + if not video: + continue + src = video.get('src') + if not src: + continue + ext = determine_ext(src, default_ext=None) + if video.get('type') == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': src, + 'ext': ext or 'mp4', + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'tbr': int_or_none(video.get('bitrate')), + }) self._sort_formats(formats) return {