From 6a4f3528c84eb5d4d1c527e83e2d9bfd7639d426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 11 Apr 2014 20:40:42 +0700 Subject: [PATCH] [firstpost] Fix extraction --- youtube_dl/extractor/firstpost.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py index 7e3d1afd21..eccd8dde9e 100644 --- a/youtube_dl/extractor/firstpost.py +++ b/youtube_dl/extractor/firstpost.py @@ -6,7 +6,6 @@ from .common import InfoExtractor class FirstpostIE(InfoExtractor): - IE_NAME = 'Firstpost.com' _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html' _TEST = { @@ -16,7 +15,6 @@ class FirstpostIE(InfoExtractor): 'id': '1025403', 'ext': 'mp4', 'title': 'India to launch indigenous aircraft carrier INS Vikrant today', - 'description': 'Its flight deck is over twice the size of a football field, its power unit can light up the entire Kochi city and the cabling is enough to cover the distance between here to Delhi.', } } @@ -24,15 +22,26 @@ class FirstpostIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, 'video URL') + data = self._download_xml( + 'http://www.firstpost.com/getvideoxml-%s.xml' % video_id, video_id, + 'Downloading video XML') + + item = data.find('./playlist/item') + thumbnail = item.find('./image').text + title = item.find('./title').text + + formats = [ + { + 'url': details.find('./file').text, + 'format_id': details.find('./label').text.strip(), + 'width': int(details.find('./width').text.strip()), + 'height': int(details.find('./height').text.strip()), + } for details in item.findall('./source/file_details') if details.find('./file').text + ] return { 'id': video_id, - 'url': video_url, - 'title': self._og_search_title(webpage), - 'description': 
self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, }