From b12cf31bb1493ba461ab8663a63b7f42164617da Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 19 Feb 2018 09:02:23 +0100 Subject: [PATCH] [cbc] add new extractor for olympics.cbc.ca(closes #15535) --- youtube_dl/extractor/cbc.py | 62 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/utils.py | 2 +- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 9faf402275..3be0c646bb 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -13,6 +14,7 @@ from ..utils import ( xpath_element, xpath_with_ns, find_xpath_attr, + parse_duration, parse_iso8601, parse_age_limit, int_or_none, @@ -359,3 +361,63 @@ class CBCWatchIE(CBCWatchBaseIE): video_id = self._match_id(url) rss = self._call_api('web/browse/' + video_id, video_id) return self._parse_rss_feed(rss) + + +class CBCOlympicsIE(InfoExtractor): + IE_NAME = 'cbc.ca:olympics' + _VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._hidden_inputs(webpage)['videoId'] + video_doc = self._download_xml( + 'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id) + title = xpath_text(video_doc, 'title', fatal=True) + is_live = xpath_text(video_doc, 'kind') == 'Live' + if is_live: + title = self._live_title(title) + + formats = [] + for video_source in video_doc.findall('videoSources/videoSource'): + uri = xpath_text(video_source, 'uri') + if not uri: + continue + tokenize = self._download_json( + 'https://olympics.cbc.ca/api/api-akamai/tokenize', + video_id, data=json.dumps({ + 'VideoSource': uri, + }).encode(), headers={ + 'Content-Type': 'application/json', + 'Referer': url, + # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js + 'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie + }, fatal=False) + if not tokenize: + continue + content_url = tokenize['ContentUrl'] + video_source_format = video_source.get('format') + if video_source_format == 'IIS': + formats.extend(self._extract_ism_formats( + content_url, video_id, ism_id=video_source_format, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id=video_source_format, fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': xpath_text(video_doc, 'description'), + 'thumbnail': xpath_text(video_doc, 'thumbnailUrl'), + 'duration': parse_duration(xpath_text(video_doc, 'duration')), + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c996de50a7..b2a4893fed 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -162,6 +162,7 @@ from .cbc import ( CBCPlayerIE, CBCWatchVideoIE, CBCWatchIE, + CBCOlympicsIE, ) from .cbs import CBSIE from .cbslocal import CBSLocalIE diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ef44b99a55..7f24cbb049 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -82,7 +82,7 @@ def register_socks_protocols(): compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate',