From 48fbb1003d901ef30654db8910b6f617efc49fb4 Mon Sep 17 00:00:00 2001 From: Adam Malcontenti-Wilson Date: Mon, 19 May 2014 21:25:58 +1000 Subject: [PATCH 01/73] [adultswim] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/adultswim.py | 100 ++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/adultswim.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3503c76b7c..5f9ae7a3e2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,5 +1,6 @@ from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE +from .adultswim import AdultSwimIE from .aftonbladet import AftonbladetIE from .anitube import AnitubeIE from .aol import AolIE diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py new file mode 100644 index 0000000000..e61916fde9 --- /dev/null +++ b/youtube_dl/extractor/adultswim.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +class AdultSwimIE(InfoExtractor): + _VALID_URL = r'https?://video\.adultswim\.com/(?P.+?)(?:\.html)?(?:\?.*)?(?:#.*)?$' + _TEST = { + 'url': 'http://video.adultswim.com/rick-and-morty/close-rick-counters-of-the-rick-kind.html?x=y#title', + 'md5': '4a90c63a07537ec9383175b330dfeab4', + 'info_dict': { + 'id': '8a250ba1450996e901453d7e9caf02f3', + 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind', + 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', + } + } + + _available_formats = ['150', '640', '3500'] + + _video_extensions = { + '3500': 'flv', + '640': 'mp4', + '150': 'mp4', + 'ipad': 'm3u8', + 'iphone': 'm3u8' + } + _video_dimensions = { + '3500': (1280, 720), + '640': (480, 270), + '150': (320, 180) + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_path = mobj.group('path') + + webpage = self._download_webpage(url, video_path) + episode_id = self._html_search_regex(r'', webpage, 'episode_id') + title = self._html_search_regex(r'', webpage, 'title') + + index_url = 'http://asfix.adultswim.com/asfix-svc/episodeSearch/getEpisodesByIDs?networkName=AS&ids=%s' % episode_id + idoc = self._download_xml(index_url, title, 'Downloading episode index', 'Unable to download episode index') + + episode_el = idoc.find('.//episode') + show_title = episode_el.attrib.get('collectionTitle') + episode_title = episode_el.attrib.get('title') + thumbnail = episode_el.attrib.get('thumbnailUrl') + description = episode_el.find('./description').text.strip() + + entries = [] + segment_els = episode_el.findall('./segments/segment') + + for part_num, segment_el in enumerate(segment_els): + segment_id = segment_el.attrib.get('id') + segment_title = '%s %s part %d' % (show_title, episode_title, part_num + 1) + thumbnail = segment_el.attrib.get('thumbnailUrl') + duration = segment_el.attrib.get('duration') + + segment_url = 'http://asfix.adultswim.com/asfix-svc/episodeservices/getCvpPlaylist?networkName=AS&id=%s' % segment_id + idoc = self._download_xml(segment_url, segment_title, 'Downloading segment information', 'Unable to download segment information') + + formats = [] + file_els = idoc.findall('.//files/file') + + for file_el in file_els: + bitrate = file_el.attrib.get('bitrate') + type = file_el.attrib.get('type') + width, height = self._video_dimensions.get(bitrate, (None, None)) + formats.append({ + 'format_id': '%s-%s' % (bitrate, type), + 'url': file_el.text, + 'ext': self._video_extensions.get(bitrate, 'mp4'), + 'tbr': bitrate, + 'height': height, + 'width': width + }) + + self._sort_formats(formats) + + entries.append({ + 'id': segment_id, + 'title': segment_title, + 'formats': formats, + 'uploader': show_title, + 'thumbnail': thumbnail, + 'duration': duration, + 'description': description + }) + + return { + '_type': 'playlist', + 'id': episode_id, + 'display_id': video_path, + 'entries': entries, + 'title': '%s %s' % (show_title, episode_title), + 'description': description, + 'thumbnail': thumbnail + } From d415299a80c44fe97a0ef914311449ec0581a0cb Mon Sep 17 00:00:00 2001 From: Adam Malcontenti-Wilson Date: Mon, 19 May 2014 22:32:45 +1000 Subject: [PATCH 02/73] [adultswim] Fix tests --- youtube_dl/extractor/adultswim.py | 52 +++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index e61916fde9..ca1bfbdc22 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -9,12 +9,52 @@ class AdultSwimIE(InfoExtractor): _VALID_URL = r'https?://video\.adultswim\.com/(?P.+?)(?:\.html)?(?:\?.*)?(?:#.*)?$' _TEST = { 'url': 'http://video.adultswim.com/rick-and-morty/close-rick-counters-of-the-rick-kind.html?x=y#title', - 'md5': '4a90c63a07537ec9383175b330dfeab4', - 'info_dict': { - 'id': '8a250ba1450996e901453d7e9caf02f3', - 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind', - 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', - } + 'playlist': [ + { + 'md5': '4da359ec73b58df4575cd01a610ba5dc', + 'info_dict': { + 'id': '8a250ba1450996e901453d7f02ca02f5', + 'ext': 'flv', + 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 1', + 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', + 'uploader': 'Rick and Morty', + 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' + } + }, + { + 'md5': 'ffbdf55af9331c509d95350bd0cc1819', + 'info_dict': { + 'id': '8a250ba1450996e901453d7f4bd102f6', + 'ext': 'flv', + 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 2', + 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', + 'uploader': 'Rick and Morty', + 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' + } + }, + { + 'md5': 'b92409635540304280b4b6c36bd14a0a', + 'info_dict': { + 'id': '8a250ba1450996e901453d7fa73c02f7', + 'ext': 'flv', + 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 3', + 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', + 'uploader': 'Rick and Morty', + 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' + } + }, + { + 'md5': 'e8818891d60e47b29cd89d7b0278156d', + 'info_dict': { + 'id': '8a250ba1450996e901453d7fc8ba02f8', + 'ext': 'flv', + 'title': 'Rick and Morty Close Rick-Counters of the Rick Kind part 4', + 'description': 'Rick has a run in with some old associates, resulting in a fallout with Morty. You got any chips, broh?', + 'uploader': 'Rick and Morty', + 'thumbnail': 'http://i.cdn.turner.com/asfix/repository/8a250ba13f865824013fc9db8b6b0400/thumbnail_267549017116827057.jpg' + } + } + ] } _available_formats = ['150', '640', '3500'] From 66aa382eae9342506db64ce3328a009fd3f06d5c Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Wed, 16 Jul 2014 02:07:20 +0300 Subject: [PATCH 03/73] [sockshare] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/sockshare.py | 77 +++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 youtube_dl/extractor/sockshare.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e49ac3e527..f3575b6c97 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -262,6 +262,7 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) +from .sockshare import SockshareIE from .sohu import SohuIE from .soundcloud import ( SoundcloudIE, diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py new file mode 100644 index 0000000000..cbf2d7abef --- /dev/null +++ b/youtube_dl/extractor/sockshare.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..utils import ( + ExtractorError, + compat_urllib_parse, + compat_urllib_request, + determine_ext, +) +import re + +from .common import InfoExtractor + + +class SockshareIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P[0-9A-Za-z]+)' + _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.' + _TEST = { + 'url': 'http://www.sockshare.com/file/437BE28B89D799D7', + 'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd', + 'info_dict': { + 'id': '437BE28B89D799D7', + 'title': 'big_buck_bunny_720p_surround.avi', + 'ext': 'avi', + 'thumbnail': 're:^http://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + url = 'http://sockshare.com/file/%s' % video_id + webpage = self._download_webpage(url, video_id) + + if re.search(self._FILE_DELETED_REGEX, webpage) is not None: + raise ExtractorError(u'Video %s does not exist' % video_id, + expected=True) + + confirm_hash = self._html_search_regex(r'''(?x)(.+)', webpage, 'title') + thumbnail = self._html_search_regex(r' Date: Wed, 16 Jul 2014 18:45:42 +0500 Subject: [PATCH 04/73] [cracked] Add new extractor --- youtube_dl/extractor/__init__.py | 2 ++ youtube_dl/extractor/cracked.py | 46 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/cracked.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e49ac3e527..78b95c2a5f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -52,6 +52,7 @@ from .cnn import ( from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .condenast import CondeNastIE +from .cracked import CrackedIE from .criterion import CriterionIE from .crunchyroll import CrunchyrollIE from .cspan import CSpanIE @@ -396,6 +397,7 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) + from .zdf import ZDFIE diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py new file mode 100644 index 0000000000..37c0f7ffb6 --- /dev/null +++ b/youtube_dl/extractor/cracked.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +class CrackedIE(InfoExtractor): + _VALID_URL = r'http?://.*?\.cracked\.com/video_+(?P.*)_.*' + _TEST = { + 'url': 'http://www.cracked.com/video_18803_4-social-criticisms-hidden-in-sonic-hedgehog-games.html', + + 'info_dict': { + 'id': '18803', + 'ext': 'mp4', + 'title': "4 Social Criticisms Hidden in 'Sonic the Hedgehog' Games | Cracked.com", + 'height': 375, + 'width': 666, + + + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._search_regex(r'(.*?)',webpage,'title') + video_url = self._search_regex(r'var CK_vidSrc = "+(.*)"',webpage,'url') + width = self._search_regex(r'width="(.*?)"',webpage,'width') + height = re.findall(r'height="(.*?)"',webpage)[1] + + + + + return { + 'url':video_url, + 'id': video_id, + 'ext':'mp4', + 'title':title, + 'height':int(height), + 'width':int(width) + + + } \ No newline at end of file From d8894e24a4f123c1fe84112693047e638b7161df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 17 Jul 2014 01:57:38 +0700 Subject: [PATCH 05/73] [rtbf] Fix data video regex --- youtube_dl/extractor/rtbf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 205f8a1676..dce64e1517 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -30,7 +30,7 @@ class RTBFIE(InfoExtractor): page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id) data = json.loads(self._html_search_regex( - r'
Date: Wed, 16 Jul 2014 22:35:09 +0200 Subject: [PATCH 06/73] [comedycentral] Recognize 'full-episodes' urls (fixes #3277) --- youtube_dl/extractor/comedycentral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 8af0abade8..3c0ce7859c 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -15,7 +15,7 @@ from ..utils import ( class ComedyCentralIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?(comedycentral|cc)\.com/ - (video-clips|episodes|cc-studios|video-collections) + (video-clips|episodes|cc-studios|video-collections|full-episodes) /(?P.*)''' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' From b4c538b02b12bfc1443e724a137aa6d54a02cc9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 16 Jul 2014 22:37:01 +0200 Subject: [PATCH 07/73] [comedycentral] Only recognize the cc.com domain The old comedycentral.com urls redirect to the new urls. --- youtube_dl/extractor/comedycentral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 3c0ce7859c..c81ce5a96f 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -14,13 +14,13 @@ from ..utils import ( class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?(comedycentral|cc)\.com/ + _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ (video-clips|episodes|cc-studios|video-collections|full-episodes) /(?P<title>.*)''' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TEST = { - 'url': 'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', 'info_dict': { 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', From ca14211e93f3800893d5519f370348ba2db84ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 17 Jul 2014 09:27:06 +0200 Subject: [PATCH 08/73] [adultswim] Simplify (closes #2952) --- youtube_dl/extractor/adultswim.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index ca1bfbdc22..fdaecdec11 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -57,8 +57,6 @@ class AdultSwimIE(InfoExtractor): ] } - _available_formats = ['150', '640', '3500'] - _video_extensions = { '3500': 'flv', '640': 'mp4', @@ -78,7 +76,7 @@ class AdultSwimIE(InfoExtractor): webpage = self._download_webpage(url, video_path) episode_id = self._html_search_regex(r'<link rel="video_src" href="http://i\.adultswim\.com/adultswim/adultswimtv/tools/swf/viralplayer.swf\?id=([0-9a-f]+?)"\s*/?\s*>', webpage, 'episode_id') - title = self._html_search_regex(r'<meta property="og:title" content="\s*(.*?)\s*"\s*/?\s*>', webpage, 'title') + title = self._og_search_title(webpage) index_url = 'http://asfix.adultswim.com/asfix-svc/episodeSearch/getEpisodesByIDs?networkName=AS&ids=%s' % episode_id idoc = self._download_xml(index_url, title, 'Downloading episode index', 'Unable to download episode index') From d9222264a8b4adcbe16286d61404acf67e0dcfa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 17 Jul 2014 09:31:48 +0200 Subject: [PATCH 09/73] [adultswim] The bitrate must be an integer or None (reported in #2952) --- youtube_dl/extractor/adultswim.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index fdaecdec11..a00bfcb35f 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -110,7 +110,8 @@ class AdultSwimIE(InfoExtractor): 'format_id': '%s-%s' % (bitrate, type), 'url': file_el.text, 'ext': self._video_extensions.get(bitrate, 'mp4'), - 'tbr': bitrate, + # The bitrate may not be a number (for example: 'iphone') + 'tbr': int(bitrate) if bitrate.isdigit() else None, 'height': height, 'width': width }) From 74aa18f68ffe5a721fae1149193c8b6401076d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 17 Jul 2014 10:07:51 +0200 Subject: [PATCH 10/73] [dfb] Add extractor (closes #3280) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/dfb.py | 44 ++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 youtube_dl/extractor/dfb.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3e3b5d44ae..03d2e446d4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -63,6 +63,7 @@ from .dailymotion import ( DailymotionUserIE, ) from .daum import DaumIE +from .dfb import DFBIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .drtv import DRTVIE diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py new file mode 100644 index 0000000000..cb8e068224 --- /dev/null +++ b/youtube_dl/extractor/dfb.py @@ -0,0 +1,44 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class DFBIE(InfoExtractor): + IE_NAME = 'tv.dfb.de' + _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P<id>\d+)' + + _TEST = { + 'url': 'http://tv.dfb.de/video/highlights-des-empfangs-in-berlin/9070/', + # The md5 is different each time + 'info_dict': { + 'id': '9070', + 'ext': 'flv', + 'title': 'Highlights des Empfangs in Berlin', + 'upload_date': '20140716', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + player_info = self._download_xml( + 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, + video_id) + video_info = player_info.find('video') + + f4m_info = self._download_xml(video_info.find('url').text, video_id) + token_el = f4m_info.find('token') + manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' + + return { + 'id': video_id, + 'title': video_info.find('title').text, + 'url': manifest_url, + 'ext': 'flv', + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': ''.join(video_info.find('time_date').text.split('.')[::-1]), + } From 530ed178b73b02087362d0b29fb158b28a37d657 Mon Sep 17 00:00:00 2001 From: MikeCol <MikeCol@gmx.net> Date: Thu, 17 Jul 2014 11:17:27 +0200 Subject: [PATCH 11/73] Redtube changed player config, new place to get thumb URL --- youtube_dl/extractor/redtube.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 4295cf93a7..d1e12dd8d5 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -35,9 +35,7 @@ class RedTubeIE(InfoExtractor): r'<h1 class="videoTitle[^"]*">(.+?)</h1>', webpage, u'title') - video_thumbnail = self._html_search_regex( - r'playerInnerHTML.+?<img\s+src="(.+?)"', - webpage, u'thumbnail', fatal=False) + video_thumbnail = self._og_search_thumbnail(webpage) # No self-labeling, but they describe themselves as # "Home of Videos Porno" From cf01013161621d2e8d5ff107588617a1f09db53d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 17 Jul 2014 16:28:30 +0200 Subject: [PATCH 12/73] [youtube] Find more swf players (Closes #3270, refer #3271) --- youtube_dl/extractor/youtube.py | 48 +++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6123e12564..5449df8e0d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -347,8 +347,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self.to_screen(u'RTMP download detected') def _extract_signature_function(self, video_id, player_url, slen): - id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$', - player_url) + id_m = re.match( + r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3)?\.(?P<ext>[a-z]+)$', + player_url) player_type = id_m.group('ext') player_id = id_m.group('id') @@ -1220,30 +1221,37 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url += '&signature=' + url_data['sig'][0] elif 's' in url_data: encrypted_sig = url_data['s'][0] - if self._downloader.params.get('verbose'): - if age_gate: - if player_url is None: - player_version = 'unknown' - else: - player_version = self._search_regex( - r'-(.+)\.swf$', player_url, - u'flash player', fatal=False) - player_desc = 'flash player %s' % player_version - else: - player_version = self._search_regex( - r'html5player-(.+?)\.js', video_webpage, - 'html5 player', fatal=False) - player_desc = u'html5 player %s' % player_version - - parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) - self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % - (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) if not age_gate: jsplayer_url_json = self._search_regex( r'"assets":.+?"js":\s*("[^"]+")', video_webpage, u'JS player URL') player_url = json.loads(jsplayer_url_json) + if player_url is None: + player_url_json = self._search_regex( + r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', + video_webpage, u'age gate player URL') + player_url = json.loads(player_url_json) + + if self._downloader.params.get('verbose'): + if player_url is None: + player_version = 'unknown' + player_desc = 'unknown' + else: + if player_url.endswith('swf'): + player_version = self._search_regex( + r'-(.+)\.swf$', player_url, + u'flash player', fatal=False) + player_desc = 'flash player %s' % player_version + else: + player_version = self._search_regex( + r'html5player-(.+?)\.js', video_webpage, + 'html5 player', fatal=False) + player_desc = u'html5 player %s' % player_version + + parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) + self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % + (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) signature = self._decrypt_signature( encrypted_sig, video_id, player_url, age_gate) From 6e74521d98e4824e56d6e4db902de07a75fa867f Mon Sep 17 00:00:00 2001 From: Reventl0v <grunblattr@gmail.com> Date: Thu, 17 Jul 2014 21:08:43 +0200 Subject: [PATCH 13/73] Fix the url in the INSTALLATION section --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bc5e0f76df..fb2f776c9a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ To install it right away for all UNIX users (Linux, OS X, etc.), type: If you do not have curl, you can alternatively use a recent wget: - sudo wget https://yt-dl.org/downloads/2014.05.13/youtube-dl -O /usr/local/bin/youtube-dl + sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl sudo chmod a+x /usr/local/bin/youtube-dl Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). From 66149e3f2b57d110045862bfbc19b3efbb50d152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 17 Jul 2014 22:29:03 +0200 Subject: [PATCH 14/73] [npo] Fix the json extraction (fixes #3282) The comment in the javascript file is not always the same. --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index fbcbe1f40c..12e85a716f 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -32,7 +32,7 @@ class NPOIE(InfoExtractor): 'http://e.omroep.nl/metadata/aflevering/%s' % video_id, video_id, # We have to remove the javascript callback - transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//epc', r'\1', j) + transform_source=lambda j: re.sub(r'parseMetadata\((.*?)\);\n//.*$', r'\1', j) ) token_page = self._download_webpage( 'http://ida.omroep.nl/npoplayer/i.js', From 3fbd27f73ef1f3507443d0e0907a4c0b19a63ed8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 17 Jul 2014 23:22:49 +0200 Subject: [PATCH 15/73] [youtube] SWF parser: Add opcode 86 Yes, I know we need 96, but an implementation of 86 could help avoid a similar issue. --- youtube_dl/extractor/youtube.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5449df8e0d..c2c4fd7e88 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -799,6 +799,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise NotImplementedError( u'Unsupported (void) property %r on %r' % (mname, obj)) + elif opcode == 86: # newarray + arg_count = u30(coder) + arr = [] + for i in range(arg_count): + arr.append(stack.pop()) + arr = arr[::-1] + stack.append(arr) elif opcode == 93: # findpropstrict index = u30(coder) mname = multinames[index] From 5dc3552d85ac2b3723d0548bbe44996d50891cf2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 18 Jul 2014 00:54:17 +0200 Subject: [PATCH 16/73] [youtube] Add support for classes in swf parser --- youtube_dl/extractor/youtube.py | 99 ++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c2c4fd7e88..16f4a047d6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -507,6 +507,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): v = - ((v ^ 0xffffffff) + 1) return v + def s24(reader): + bs = reader.read(3) + assert len(bs) == 3 + first_byte = b'\xff' if (ord(bs[0:1]) >= 0x80) else b'\x00' + return struct.unpack('!i', first_byte + bs) + def read_string(reader=None): if reader is None: reader = code_reader @@ -647,16 +653,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return methods + class AVMClass(object): + def __init__(self, name_idx): + self.name_idx = name_idx + self.method_names = {} + self.method_idxs = {} + self.methods = {} + self.method_pyfunctions = {} + self.variables = {} + + @property + def name(self): + return multinames[self.name_idx] + # Classes - TARGET_CLASSNAME = u'SignatureDecipher' - searched_idx = multinames.index(TARGET_CLASSNAME) - searched_class_id = None class_count = u30() + classes = [] for class_id in range(class_count): name_idx = u30() - if name_idx == searched_idx: - # We found the class we're looking for! - searched_class_id = class_id + classes.append(AVMClass(name_idx)) u30() # super_name idx flags = read_byte() if flags & 0x08 != 0: # Protected namespace is present @@ -668,23 +683,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): trait_count = u30() for _c2 in range(trait_count): parse_traits_info() + assert len(classes) == class_count - if searched_class_id is None: + TARGET_CLASSNAME = u'SignatureDecipher' + searched_class = next( + c for c in classes if c.name == TARGET_CLASSNAME) + if searched_class is None: raise ExtractorError(u'Target class %r not found' % TARGET_CLASSNAME) - method_names = {} - method_idxs = {} - for class_id in range(class_count): + for avm_class in classes: u30() # cinit trait_count = u30() for _c2 in range(trait_count): trait_methods = parse_traits_info() - if class_id == searched_class_id: - method_names.update(trait_methods.items()) - method_idxs.update(dict( - (idx, name) - for name, idx in trait_methods.items())) + avm_class.method_names.update(trait_methods.items()) + avm_class.method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) # Scripts script_count = u30() @@ -697,7 +713,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Method bodies method_body_count = u30() Method = collections.namedtuple('Method', ['code', 'local_count']) - methods = {} for _c in range(method_body_count): method_idx = u30() u30() # max_stack @@ -706,9 +721,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u30() # max_scope_depth code_length = u30() code = read_bytes(code_length) - if method_idx in method_idxs: - m = Method(code, local_count) - methods[method_idxs[method_idx]] = m + for avm_class in classes: + if method_idx in avm_class.method_idxs: + m = Method(code, local_count) + avm_class.methods[avm_class.method_idxs[method_idx]] = m exception_count = u30() for _c2 in range(exception_count): u30() # from @@ -721,16 +737,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): parse_traits_info() assert p + code_reader.tell() == len(code_tag) - assert len(methods) == len(method_idxs) - method_pyfunctions = {} - - def extract_function(func_name): - if func_name in method_pyfunctions: - return method_pyfunctions[func_name] - if func_name not in methods: + def extract_function(avm_class, func_name): + if func_name in avm_class.method_pyfunctions: + return avm_class.method_pyfunctions[func_name] + if func_name not in avm_class.methods: raise ExtractorError(u'Cannot find function %r' % func_name) - m = methods[func_name] + m = avm_class.methods[func_name] def resfunc(args): registers = ['(this)'] + list(args) + [None] * m.local_count @@ -738,7 +751,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): coder = io.BytesIO(m.code) while True: opcode = struct.unpack('!B', coder.read(1))[0] - if opcode == 36: # pushbyte + if opcode == 17: # iftrue + offset = s24(coder) + value = stack.pop() + if value: + coder.seek(coder.tell() + offset) + elif opcode == 36: # pushbyte v = struct.unpack('!B', coder.read(1))[0] stack.append(v) elif opcode == 44: # pushstring @@ -776,8 +794,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): assert isinstance(obj, list) res = args[0].join(obj) stack.append(res) - elif mname in method_pyfunctions: - stack.append(method_pyfunctions[mname](args)) + elif mname in avm_class.method_pyfunctions: + stack.append(avm_class.method_pyfunctions[mname](args)) else: raise NotImplementedError( u'Unsupported property %r on %r' @@ -809,7 +827,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): elif opcode == 93: # findpropstrict index = u30(coder) mname = multinames[index] - res = extract_function(mname) + res = extract_function(avm_class, mname) + stack.append(res) + elif opcode == 94: # findproperty + index = u30(coder) + mname = multinames[index] + res = avm_class.variables.get(mname) + stack.append(res) + elif opcode == 96: # getlex + index = u30(coder) + mname = multinames[index] + res = avm_class.variables.get(mname) stack.append(res) elif opcode == 97: # setproperty index = u30(coder) @@ -848,6 +876,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): value1 = stack.pop() res = value1 % value2 stack.append(res) + elif opcode == 175: # greaterequals + value2 = stack.pop() + value1 = stack.pop() + result = value1 >= value2 + stack.append(result) elif opcode == 208: # getlocal_0 stack.append(registers[0]) elif opcode == 209: # getlocal_1 @@ -864,10 +897,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise NotImplementedError( u'Unsupported opcode %d' % opcode) - method_pyfunctions[func_name] = resfunc + avm_class.method_pyfunctions[func_name] = resfunc return resfunc - initial_function = extract_function(u'decipher') + initial_function = extract_function(searched_class, u'decipher') return lambda s: initial_function([s]) def _decrypt_signature(self, s, video_id, player_url, age_gate=False): From 5425626790a46f9b5bdecf4e33bb254c4c2423ea Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 18 Jul 2014 10:24:28 +0200 Subject: [PATCH 17/73] [youtube] Move swfinterp into its own file --- test/test_youtube_signature.py | 12 +- youtube_dl/extractor/youtube.py | 454 +--------------------------- youtube_dl/swfinterp.py | 503 ++++++++++++++++++++++++++++++++ 3 files changed, 516 insertions(+), 453 deletions(-) create mode 100644 youtube_dl/swfinterp.py diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index d955339594..e443e0be88 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -45,6 +45,12 @@ _TESTS = [ u'2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', u'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), + ( + u'http://s.ytimg.com/yts/swfbin/player-vfl5vIhK2/watch_as3.swf', + u'swf', + 86, + u'23456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!?#$%&\'()*+,-./:;<=>"' + ), ] @@ -57,12 +63,12 @@ class TestSignature(unittest.TestCase): def make_tfunc(url, stype, sig_input, expected_sig): - basename = url.rpartition('/')[2] - m = re.match(r'.*-([a-zA-Z0-9_-]+)\.[a-z]+$', basename) - assert m, '%r should follow URL format' % basename + m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3)?\.[a-z]+$', url) + assert m, '%r should follow URL format' % url test_id = m.group(1) def test_func(self): + basename = 'player-%s.%s' % (test_id, stype) fn = os.path.join(self.TESTDATA_DIR, basename) if not os.path.exists(fn): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 16f4a047d6..623056bd96 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -14,6 +14,7 @@ import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter +from ..swfinterp import SWFInterpreter from ..utils import ( compat_chr, compat_parse_qs, @@ -450,457 +451,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return lambda s: initial_function([s]) def _parse_sig_swf(self, file_contents): - if file_contents[1:3] != b'WS': - raise ExtractorError( - u'Not an SWF file; header is %r' % file_contents[:3]) - if file_contents[:1] == b'C': - content = zlib.decompress(file_contents[8:]) - else: - raise NotImplementedError(u'Unsupported compression format %r' % - file_contents[:1]) - - def extract_tags(content): - pos = 0 - while pos < len(content): - header16 = struct.unpack('<H', content[pos:pos+2])[0] - pos += 2 - tag_code = header16 >> 6 - tag_len = header16 & 0x3f - if tag_len == 0x3f: - tag_len = struct.unpack('<I', content[pos:pos+4])[0] - pos += 4 - assert pos+tag_len <= len(content) - yield (tag_code, content[pos:pos+tag_len]) - pos += tag_len - - code_tag = next(tag - for tag_code, tag in extract_tags(content) - if tag_code == 82) - p = code_tag.index(b'\0', 4) + 1 - code_reader = io.BytesIO(code_tag[p:]) - - # Parse ABC (AVM2 ByteCode) - def read_int(reader=None): - if reader is None: - reader = code_reader - res = 0 - shift = 0 - for _ in range(5): - buf = reader.read(1) - assert len(buf) == 1 - b = struct.unpack('<B', buf)[0] - res = res | ((b & 0x7f) << shift) - if b & 0x80 == 0: - break - shift += 7 - return res - - def u30(reader=None): - res = read_int(reader) - assert res & 0xf0000000 == 0 - return res - u32 = read_int - - def s32(reader=None): - v = read_int(reader) - if v & 0x80000000 != 0: - v = - ((v ^ 0xffffffff) + 1) - return v - - def s24(reader): - bs = reader.read(3) - assert len(bs) == 3 - first_byte = b'\xff' if (ord(bs[0:1]) >= 0x80) else b'\x00' - return struct.unpack('!i', first_byte + bs) - - def read_string(reader=None): - if reader is None: - reader = code_reader - slen = u30(reader) - resb = reader.read(slen) - assert len(resb) == slen - return resb.decode('utf-8') - - def read_bytes(count, reader=None): - if reader is None: - reader = code_reader - resb = reader.read(count) - assert len(resb) == count - return resb - - def read_byte(reader=None): - resb = read_bytes(1, reader=reader) - res = struct.unpack('<B', resb)[0] - return res - - # minor_version + major_version - read_bytes(2 + 2) - - # Constant pool - int_count = u30() - for _c in range(1, int_count): - s32() - uint_count = u30() - for _c in range(1, uint_count): - u32() - double_count = u30() - read_bytes((double_count-1) * 8) - string_count = u30() - constant_strings = [u''] - for _c in range(1, string_count): - s = read_string() - constant_strings.append(s) - namespace_count = u30() - for _c in range(1, namespace_count): - read_bytes(1) # kind - u30() # name - ns_set_count = u30() - for _c in range(1, ns_set_count): - count = u30() - for _c2 in range(count): - u30() - multiname_count = u30() - MULTINAME_SIZES = { - 0x07: 2, # QName - 0x0d: 2, # QNameA - 0x0f: 1, # RTQName - 0x10: 1, # RTQNameA - 0x11: 0, # RTQNameL - 0x12: 0, # RTQNameLA - 0x09: 2, # Multiname - 0x0e: 2, # MultinameA - 0x1b: 1, # MultinameL - 0x1c: 1, # MultinameLA - } - multinames = [u''] - for _c in range(1, multiname_count): - kind = u30() - assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind - if kind == 0x07: - u30() # namespace_idx - name_idx = u30() - multinames.append(constant_strings[name_idx]) - else: - multinames.append('[MULTINAME kind: %d]' % kind) - for _c2 in range(MULTINAME_SIZES[kind]): - u30() - - # Methods - method_count = u30() - MethodInfo = collections.namedtuple( - 'MethodInfo', - ['NEED_ARGUMENTS', 'NEED_REST']) - method_infos = [] - for method_id in range(method_count): - param_count = u30() - u30() # return type - for _ in range(param_count): - u30() # param type - u30() # name index (always 0 for youtube) - flags = read_byte() - if flags & 0x08 != 0: - # Options present - option_count = u30() - for c in range(option_count): - u30() # val - read_bytes(1) # kind - if flags & 0x80 != 0: - # Param names present - for _ in range(param_count): - u30() # param name - mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) - method_infos.append(mi) - - # Metadata - metadata_count = u30() - for _c in range(metadata_count): - u30() # name - item_count = u30() - for _c2 in range(item_count): - u30() # key - u30() # value - - def parse_traits_info(): - trait_name_idx = u30() - kind_full = read_byte() - kind = kind_full & 0x0f - attrs = kind_full >> 4 - methods = {} - if kind in [0x00, 0x06]: # Slot or Const - u30() # Slot id - u30() # type_name_idx - vindex = u30() - if vindex != 0: - read_byte() # vkind - elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter - u30() # disp_id - method_idx = u30() - methods[multinames[trait_name_idx]] = method_idx - elif kind == 0x04: # Class - u30() # slot_id - u30() # classi - elif kind == 0x05: # Function - u30() # slot_id - function_idx = u30() - methods[function_idx] = multinames[trait_name_idx] - else: - raise ExtractorError(u'Unsupported trait kind %d' % kind) - - if attrs & 0x4 != 0: # Metadata present - metadata_count = u30() - for _c3 in range(metadata_count): - u30() # metadata index - - return methods - - class AVMClass(object): - def __init__(self, name_idx): - self.name_idx = name_idx - self.method_names = {} - self.method_idxs = {} - self.methods = {} - self.method_pyfunctions = {} - self.variables = {} - - @property - def name(self): - return multinames[self.name_idx] - - # Classes - class_count = u30() - classes = [] - for class_id in range(class_count): - name_idx = u30() - classes.append(AVMClass(name_idx)) - u30() # super_name idx - flags = read_byte() - if flags & 0x08 != 0: # Protected namespace is present - u30() # protected_ns_idx - intrf_count = u30() - for _c2 in range(intrf_count): - u30() - u30() # iinit - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - assert len(classes) == class_count - + swfi = SWFInterpreter(file_contents) TARGET_CLASSNAME = u'SignatureDecipher' - searched_class = next( - c for c in classes if c.name == TARGET_CLASSNAME) - if searched_class is None: - raise ExtractorError(u'Target class %r not found' % - TARGET_CLASSNAME) - - for avm_class in classes: - u30() # cinit - trait_count = u30() - for _c2 in range(trait_count): - trait_methods = parse_traits_info() - avm_class.method_names.update(trait_methods.items()) - avm_class.method_idxs.update(dict( - (idx, name) - for name, idx in trait_methods.items())) - - # Scripts - script_count = u30() - for _c in range(script_count): - u30() # init - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - # Method bodies - method_body_count = u30() - Method = collections.namedtuple('Method', ['code', 'local_count']) - for _c in range(method_body_count): - method_idx = u30() - u30() # max_stack - local_count = u30() - u30() # init_scope_depth - u30() # max_scope_depth - code_length = u30() - code = read_bytes(code_length) - for avm_class in classes: - if method_idx in avm_class.method_idxs: - m = Method(code, local_count) - avm_class.methods[avm_class.method_idxs[method_idx]] = m - exception_count = u30() - for _c2 in range(exception_count): - u30() # from - u30() # to - u30() # target - u30() # exc_type - u30() # var_name - trait_count = u30() - for _c2 in range(trait_count): - parse_traits_info() - - assert p + code_reader.tell() == len(code_tag) - - def extract_function(avm_class, func_name): - if func_name in avm_class.method_pyfunctions: - return avm_class.method_pyfunctions[func_name] - if func_name not in avm_class.methods: - raise ExtractorError(u'Cannot find function %r' % func_name) - m = avm_class.methods[func_name] - - def resfunc(args): - registers = ['(this)'] + list(args) + [None] * m.local_count - stack = [] - coder = io.BytesIO(m.code) - while True: - opcode = struct.unpack('!B', coder.read(1))[0] - if opcode == 17: # iftrue - offset = s24(coder) - value = stack.pop() - if value: - coder.seek(coder.tell() + offset) - elif opcode == 36: # pushbyte - v = struct.unpack('!B', coder.read(1))[0] - stack.append(v) - elif opcode == 44: # pushstring - idx = u30(coder) - stack.append(constant_strings[idx]) - elif opcode == 48: # pushscope - # We don't implement the scope register, so we'll just - # ignore the popped value - stack.pop() - elif opcode == 70: # callproperty - index = u30(coder) - mname = multinames[index] - arg_count = u30(coder) - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - if mname == u'split': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, compat_str) - if args[0] == u'': - res = list(obj) - else: - res = obj.split(args[0]) - stack.append(res) - elif mname == u'slice': - assert len(args) == 1 - assert isinstance(args[0], int) - assert isinstance(obj, list) - res = obj[args[0]:] - stack.append(res) - elif mname == u'join': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, list) - res = args[0].join(obj) - stack.append(res) - elif mname in avm_class.method_pyfunctions: - stack.append(avm_class.method_pyfunctions[mname](args)) - else: - raise NotImplementedError( - u'Unsupported property %r on %r' - % (mname, obj)) - elif opcode == 72: # returnvalue - res = stack.pop() - return res - elif opcode == 79: # callpropvoid - index = u30(coder) - mname = multinames[index] - arg_count = u30(coder) - args = list(reversed( - [stack.pop() for _ in range(arg_count)])) - obj = stack.pop() - if mname == u'reverse': - assert isinstance(obj, list) - obj.reverse() - else: - raise NotImplementedError( - u'Unsupported (void) property %r on %r' - % (mname, obj)) - elif opcode == 86: # newarray - arg_count = u30(coder) - arr = [] - for i in range(arg_count): - arr.append(stack.pop()) - arr = arr[::-1] - stack.append(arr) - elif opcode == 93: # findpropstrict - index = u30(coder) - mname = multinames[index] - res = extract_function(avm_class, mname) - stack.append(res) - elif opcode == 94: # findproperty - index = u30(coder) - mname = multinames[index] - res = avm_class.variables.get(mname) - stack.append(res) - elif opcode == 96: # getlex - index = u30(coder) - mname = multinames[index] - res = avm_class.variables.get(mname) - stack.append(res) - elif opcode == 97: # setproperty - index = u30(coder) - value = stack.pop() - idx = stack.pop() - obj = stack.pop() - assert isinstance(obj, list) - assert isinstance(idx, int) - obj[idx] = value - elif opcode == 98: # getlocal - index = u30(coder) - stack.append(registers[index]) - elif opcode == 99: # setlocal - index = u30(coder) - value = stack.pop() - registers[index] = value - elif opcode == 102: # getproperty - index = u30(coder) - pname = multinames[index] - if pname == u'length': - obj = stack.pop() - assert isinstance(obj, list) - stack.append(len(obj)) - else: # Assume attribute access - idx = stack.pop() - assert isinstance(idx, int) - obj = stack.pop() - assert isinstance(obj, list) - stack.append(obj[idx]) - elif opcode == 128: # coerce - u30(coder) - elif opcode == 133: # coerce_s - assert isinstance(stack[-1], (type(None), compat_str)) - elif opcode == 164: # modulo - value2 = stack.pop() - value1 = stack.pop() - res = value1 % value2 - stack.append(res) - elif opcode == 175: # greaterequals - value2 = stack.pop() - value1 = stack.pop() - result = value1 >= value2 - stack.append(result) - elif opcode == 208: # getlocal_0 - stack.append(registers[0]) - elif opcode == 209: # getlocal_1 - stack.append(registers[1]) - elif opcode == 210: # getlocal_2 - stack.append(registers[2]) - elif opcode == 211: # getlocal_3 - stack.append(registers[3]) - elif opcode == 214: # setlocal_2 - registers[2] = stack.pop() - elif opcode == 215: # setlocal_3 - registers[3] = stack.pop() - else: - raise NotImplementedError( - u'Unsupported opcode %d' % opcode) - - avm_class.method_pyfunctions[func_name] = resfunc - return resfunc - - initial_function = extract_function(searched_class, u'decipher') + searched_class = swfi.extract_class(TARGET_CLASSNAME) + initial_function = swfi.extract_function(searched_class, u'decipher') return lambda s: initial_function([s]) def _decrypt_signature(self, s, video_id, player_url, age_gate=False): diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py new file mode 100644 index 0000000000..1cd2921386 --- /dev/null +++ b/youtube_dl/swfinterp.py @@ -0,0 +1,503 @@ +from __future__ import unicode_literals + +import collections +import io +import struct +import zlib + +from .utils import ExtractorError + + +def _extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('<H', content[pos:pos + 2])[0] + pos += 2 + tag_code = header16 >> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('<I', content[pos:pos + 4])[0] + pos += 4 + assert pos + tag_len <= len(content) + yield (tag_code, content[pos:pos + tag_len]) + pos += tag_len + + +class _AVMClass_Object(object): + def __init__(self, avm_class): + self.avm_class = avm_class + + def __repr__(self): + return '%s#%x' % (self.avm_class.name, id(self)) + + +class _AVMClass(object): + def __init__(self, name_idx, name): + self.name_idx = name_idx + self.name = name + self.method_names = {} + self.method_idxs = {} + self.methods = {} + self.method_pyfunctions = {} + self.variables = {} + + def make_object(self): + return _AVMClass_Object(self) + + +def _read_int(reader): + res = 0 + shift = 0 + for _ in range(5): + buf = reader.read(1) + assert len(buf) == 1 + b = struct.unpack('<B', buf)[0] + res = res | ((b & 0x7f) << shift) + if b & 0x80 == 0: + break + shift += 7 + return res + + +def _u30(reader): + res = _read_int(reader) + assert res & 0xf0000000 == 0 + return res +u32 = _read_int + + +def _s32(reader): + v = _read_int(reader) + if v & 0x80000000 != 0: + v = - ((v ^ 0xffffffff) + 1) + return v + + +def _s24(reader): + bs = reader.read(3) + assert len(bs) == 3 + first_byte = b'\xff' if (ord(bs[0:1]) >= 0x80) else b'\x00' + return struct.unpack('!i', first_byte + bs) + + +def _read_string(reader): + slen = _u30(reader) + resb = reader.read(slen) + assert len(resb) == slen + return resb.decode('utf-8') + + +def _read_bytes(count, reader): + if reader is None: + reader = code_reader + resb = reader.read(count) + assert len(resb) == count + return resb + + +def _read_byte(reader): + resb = _read_bytes(1, reader=reader) + res = struct.unpack('<B', resb)[0] + return res + + +class SWFInterpreter(object): + def __init__(self, file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + 'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError( + 'Unsupported compression format %r' % + file_contents[:1]) + + code_tag = next(tag + for tag_code, tag in _extract_tags(content) + if tag_code == 82) + p = code_tag.index(b'\0', 4) + 1 + code_reader = io.BytesIO(code_tag[p:]) + + # Parse ABC (AVM2 ByteCode) + + # Define a couple convenience methods + u30 = lambda *args: _u30(*args, reader=code_reader) + s32 = lambda *args: _s32(*args, reader=code_reader) + u32 = lambda *args: _u32(*args, reader=code_reader) + read_bytes = lambda *args: _read_bytes(*args, reader=code_reader) + read_byte = lambda *args: _read_byte(*args, reader=code_reader) + + # minor_version + major_version + read_bytes(2 + 2) + + # Constant pool + int_count = u30() + for _c in range(1, int_count): + s32() + uint_count = u30() + for _c in range(1, uint_count): + u32() + double_count = u30() + read_bytes((double_count - 1) * 8) + string_count = u30() + constant_strings = [''] + for _c in range(1, string_count): + s = _read_string(code_reader) + constant_strings.append(s) + namespace_count = u30() + for _c in range(1, namespace_count): + read_bytes(1) # kind + u30() # name + ns_set_count = u30() + for _c in range(1, ns_set_count): + count = u30() + for _c2 in range(count): + u30() + multiname_count = u30() + MULTINAME_SIZES = { + 0x07: 2, # QName + 0x0d: 2, # QNameA + 0x0f: 1, # RTQName + 0x10: 1, # RTQNameA + 0x11: 0, # RTQNameL + 0x12: 0, # RTQNameLA + 0x09: 2, # Multiname + 0x0e: 2, # MultinameA + 0x1b: 1, # MultinameL + 0x1c: 1, # MultinameLA + } + self.multinames = [''] + for _c in range(1, multiname_count): + kind = u30() + assert kind in MULTINAME_SIZES, 'Invalid multiname kind %r' % kind + if kind == 0x07: + u30() # namespace_idx + name_idx = u30() + self.multinames.append(constant_strings[name_idx]) + else: + self.multinames.append('[MULTINAME kind: %d]' % kind) + for _c2 in range(MULTINAME_SIZES[kind]): + u30() + + # Methods + method_count = u30() + MethodInfo = collections.namedtuple( + 'MethodInfo', + ['NEED_ARGUMENTS', 'NEED_REST']) + method_infos = [] + for method_id in range(method_count): + param_count = u30() + u30() # return type + for _ in range(param_count): + u30() # param type + u30() # name index (always 0 for youtube) + flags = read_byte() + if flags & 0x08 != 0: + # Options present + option_count = u30() + for c in range(option_count): + u30() # val + read_bytes(1) # kind + if flags & 0x80 != 0: + # Param names present + for _ in range(param_count): + u30() # param name + mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0) + method_infos.append(mi) + + # Metadata + metadata_count = u30() + for _c in range(metadata_count): + u30() # name + item_count = u30() + for _c2 in range(item_count): + u30() # key + u30() # value + + def parse_traits_info(): + trait_name_idx = u30() + kind_full = read_byte() + kind = kind_full & 0x0f + attrs = kind_full >> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + u30() # Slot id + u30() # type_name_idx + vindex = u30() + if vindex != 0: + read_byte() # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + u30() # disp_id + method_idx = u30() + methods[self.multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + u30() # slot_id + u30() # classi + elif kind == 0x05: # Function + u30() # slot_id + function_idx = u30() + methods[function_idx] = self.multinames[trait_name_idx] + else: + raise ExtractorError('Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count = u30() + for _c3 in range(metadata_count): + u30() # metadata index + + return methods + + # Classes + class_count = u30() + classes = [] + for class_id in range(class_count): + name_idx = u30() + classes.append(_AVMClass(name_idx, self.multinames[name_idx])) + u30() # super_name idx + flags = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + u30() # protected_ns_idx + intrf_count = u30() + for _c2 in range(intrf_count): + u30() + u30() # iinit + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + assert len(classes) == class_count + self._classes_by_name = dict((c.name, c) for c in classes) + + for avm_class in classes: + u30() # cinit + trait_count = u30() + for _c2 in range(trait_count): + trait_methods = parse_traits_info() + avm_class.method_names.update(trait_methods.items()) + avm_class.method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count = u30() + for _c in range(script_count): + u30() # init + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + # Method bodies + method_body_count = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + for _c in range(method_body_count): + method_idx = u30() + u30() # max_stack + local_count = u30() + u30() # init_scope_depth + u30() # max_scope_depth + code_length = u30() + code = read_bytes(code_length) + for avm_class in classes: + if method_idx in avm_class.method_idxs: + m = Method(code, local_count) + avm_class.methods[avm_class.method_idxs[method_idx]] = m + exception_count = u30() + for _c2 in range(exception_count): + u30() # from + u30() # to + u30() # target + u30() # exc_type + u30() # var_name + trait_count = u30() + for _c2 in range(trait_count): + parse_traits_info() + + assert p + code_reader.tell() == len(code_tag) + + def extract_class(self, class_name): + try: + return self._classes_by_name[class_name] + except KeyError: + raise ExtractorError('Class %r not found' % class_name) + + def extract_function(self, avm_class, func_name): + if func_name in avm_class.method_pyfunctions: + return avm_class.method_pyfunctions[func_name] + if func_name in self._classes_by_name: + return self._classes_by_name[func_name].make_object() + if func_name not in avm_class.methods: + raise ExtractorError('Cannot find function %r' % func_name) + m = avm_class.methods[func_name] + + def resfunc(args): + # Helper functions + coder = io.BytesIO(m.code) + s24 = lambda: _s24(coder) + u30 = lambda: _u30(coder) + + print('Invoking %s.%s(%r)' % (avm_class.name, func_name, tuple(args))) + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + while True: + opcode = _read_byte(coder) + print('opcode: %r, stack(%d): %r' % (opcode, len(stack), stack)) + if opcode == 17: # iftrue + offset = s24() + value = stack.pop() + if value: + coder.seek(coder.tell() + offset) + elif opcode == 36: # pushbyte + v = _read_byte(coder) + stack.append(v) + elif opcode == 44: # pushstring + idx = u30() + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + new_scope = stack.pop() + elif opcode == 70: # callproperty + index = u30() + mname = self.multinames[index] + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == 'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, compat_str) + if args[0] == '': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname == 'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + assert isinstance(obj, list) + res = obj[args[0]:] + stack.append(res) + elif mname == 'join': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, list) + res = args[0].join(obj) + stack.append(res) + elif mname in avm_class.method_pyfunctions: + stack.append(avm_class.method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + 'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 72: # returnvalue + res = stack.pop() + return res + elif opcode == 74: # constructproperty + index = u30() + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + + mname = self.multinames[index] + construct_method = self.extract_function( + obj.avm_class, mname) + # We do not actually call the constructor for now; + # we just pretend it does nothing + stack.append(obj) + elif opcode == 79: # callpropvoid + index = u30() + mname = self.multinames[index] + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == 'reverse': + assert isinstance(obj, list) + obj.reverse() + else: + raise NotImplementedError( + 'Unsupported (void) property %r on %r' + % (mname, obj)) + elif opcode == 86: # newarray + arg_count = u30() + arr = [] + for i in range(arg_count): + arr.append(stack.pop()) + arr = arr[::-1] + stack.append(arr) + elif opcode == 93: # findpropstrict + index = u30() + mname = self.multinames[index] + res = self.extract_function(avm_class, mname) + stack.append(res) + elif opcode == 94: # findproperty + index = u30() + mname = self.multinames[index] + res = avm_class.variables.get(mname) + stack.append(res) + elif opcode == 96: # getlex + index = u30() + mname = self.multinames[index] + res = avm_class.variables.get(mname, None) + stack.append(res) + elif opcode == 97: # setproperty + index = u30() + value = stack.pop() + idx = self.multinames[index] + obj = stack.pop() + obj[idx] = value + elif opcode == 98: # getlocal + index = u30() + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30() + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30() + pname = self.multinames[index] + if pname == 'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + u30() + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 175: # greaterequals + value2 = stack.pop() + value1 = stack.pop() + result = value1 >= value2 + stack.append(result) + elif opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 211: # getlocal_3 + stack.append(registers[3]) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + 'Unsupported opcode %d' % opcode) + + avm_class.method_pyfunctions[func_name] = resfunc + return resfunc + From c45a6caa95e9cea09b84417cfd13ff066986c695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Jul 2014 21:37:40 +0700 Subject: [PATCH 18/73] [utils] Add None check in str_to_int --- youtube_dl/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 64a9618ca6..919603c623 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1194,6 +1194,8 @@ def format_bytes(bytes): def str_to_int(int_str): + if int_str is None: + return None int_str = re.sub(r'[,\.]', u'', int_str) return int(int_str) From e0942e37aa6f20de88f6d5ed7dc288408500371a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Jul 2014 21:39:21 +0700 Subject: [PATCH 19/73] [crackled] Improve, fix invalid regexes and extract more metadata --- youtube_dl/extractor/cracked.py | 61 +++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py index 37c0f7ffb6..74b880ffce 100644 --- a/youtube_dl/extractor/cracked.py +++ b/youtube_dl/extractor/cracked.py @@ -1,23 +1,26 @@ -# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + str_to_int, +) + class CrackedIE(InfoExtractor): - _VALID_URL = r'http?://.*?\.cracked\.com/video_+(?P<id>.*)_.*' + _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html' _TEST = { - 'url': 'http://www.cracked.com/video_18803_4-social-criticisms-hidden-in-sonic-hedgehog-games.html', - + 'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html', + 'md5': '4b29a5eeec292cd5eca6388c7558db9e', 'info_dict': { - 'id': '18803', + 'id': '19006', 'ext': 'mp4', - 'title': "4 Social Criticisms Hidden in 'Sonic the Hedgehog' Games | Cracked.com", - 'height': 375, - 'width': 666, - - + 'title': '4 Plot Holes You Didn\'t Notice in Your Favorite Movies', + 'description': 'md5:3b909e752661db86007d10e5ec2df769', + 'timestamp': 1405659600, + 'upload_date': '20140718', } } @@ -26,21 +29,37 @@ class CrackedIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'<title>(.*?)',webpage,'title') - video_url = self._search_regex(r'var CK_vidSrc = "+(.*)"',webpage,'url') - width = self._search_regex(r'width="(.*?)"',webpage,'width') - height = re.findall(r'height="(.*?)"',webpage)[1] + video_url = self._html_search_regex( + [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'([\d,\.]+) Views', webpage, 'view count', fatal=False)) + comment_count = str_to_int(self._html_search_regex( + r'([\d,\.]+)', webpage, 'comment count', fatal=False)) + + m = re.search(r'_(?P\d+)X(?P\d+)\.mp4$', video_url) + if m: + width = int(m.group('width')) + height = int(m.group('height')) + else: + width = height = None return { - 'url':video_url, 'id': video_id, - 'ext':'mp4', - 'title':title, - 'height':int(height), - 'width':int(width) - - + 'url':video_url, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'view_count': view_count, + 'comment_count': comment_count, + 'height': height, + 'width': width, } \ No newline at end of file From 5e95cb27d683c62ff03bd219b5aaf41fc62289bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 18 Jul 2014 21:41:34 +0700 Subject: [PATCH 20/73] Credit @hassaanaliw for cracked (#3274) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 6e2359b28e..f223b75f4b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -65,6 +65,7 @@ __authors__ = ( 'Tobias Bell', 'Naglis Jonaitis', 'Charles Chen', + 'Hassaan Ali', ) __license__ = 'Public Domain' From 0c1ffe980d82c664ed46c31eae7c589ebcd7fc20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 18 Jul 2014 21:43:01 +0700 Subject: [PATCH 21/73] [mlb] Fix _VALID_URL --- youtube_dl/extractor/mlb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 18ab2c1352..c28be3a7d7 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -11,7 +11,7 @@ from ..utils import ( class MLBIE(InfoExtractor): - _VALID_URL = r'http?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?Pn?\d+)' + _VALID_URL = r'https?://m\.mlb\.com/video/(?:topic/[\da-z_-]+/)?v(?Pn?\d+)' _TESTS = [ { 'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', From 23d3c422ab1e38e674ed9f0e6efebb41d5ff63fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Jul 2014 17:47:50 +0700 Subject: [PATCH 22/73] [francetv] Add support for mobile URLs (Closes #3275) --- youtube_dl/extractor/francetv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index f3e0f38b72..1fbe6d1759 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -48,7 +48,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://www\.francetvinfo\.fr/.*/(?P.+)\.html' + _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P<title>.+)\.html' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -211,7 +211,7 @@ class GenerationQuoiIE(InfoExtractor): class CultureboxIE(FranceTVBaseInfoExtractor): IE_NAME = 'culturebox.francetvinfo.fr' - _VALID_URL = r'https?://culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)' + _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)' _TEST = { 'url': 'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813', From 604f292ab7701b9284e50877f68ae7fcadcc34bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Jul 2014 00:00:20 +0700 Subject: [PATCH 23/73] [sapo] Add extractor (Closes #2816) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/sapo.py | 119 +++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 youtube_dl/extractor/sapo.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f78aa066ff..a17a80a5f7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -253,6 +253,7 @@ from .rutube import ( RutubePersonIE, ) from .rutv import RUTVIE +from .sapo import SapoIE from .savefrom import SaveFromIE from .scivee import SciVeeIE from .screencast import ScreencastIE diff --git a/youtube_dl/extractor/sapo.py b/youtube_dl/extractor/sapo.py new file mode 100644 index 0000000000..172cc12752 --- /dev/null +++ b/youtube_dl/extractor/sapo.py @@ -0,0 +1,119 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, +) + + +class SapoIE(InfoExtractor): + IE_DESC = 'SAPO Vídeos' + _VALID_URL = r'https?://(?:(?:v2|www)\.)?videos\.sapo\.(?:pt|cv|ao|mz|tl)/(?P<id>[\da-zA-Z]{20})' + + _TESTS = [ + { + 'url': 'http://videos.sapo.pt/UBz95kOtiWYUMTA5Ghfi', + 'md5': '79ee523f6ecb9233ac25075dee0eda83', + 'note': 'SD video', + 'info_dict': { + 'id': 'UBz95kOtiWYUMTA5Ghfi', + 'ext': 'mp4', + 'title': 'Benfica - Marcas na Hitória', + 'description': 'md5:c9082000a128c3fd57bf0299e1367f22', + 'duration': 264, + 'uploader': 'tiago_1988', + 'upload_date': '20080229', + 'categories': ['benfica', 'cabral', 'desporto', 'futebol', 'geovanni', 'hooijdonk', 'joao', 'karel', 'lisboa', 'miccoli'], + }, + }, + { + 'url': 'http://videos.sapo.pt/IyusNAZ791ZdoCY5H5IF', + 'md5': '90a2f283cfb49193fe06e861613a72aa', + 'note': 'HD video', + 'info_dict': { + 'id': 'IyusNAZ791ZdoCY5H5IF', + 'ext': 'mp4', + 'title': 'Codebits VII - Report', + 'description': 'md5:6448d6fd81ce86feac05321f354dbdc8', + 'duration': 144, + 'uploader': 'codebits', + 'upload_date': '20140427', + 'categories': ['codebits', 'codebits2014'], + }, + }, + { + 'url': 'http://v2.videos.sapo.pt/yLqjzPtbTimsn2wWBKHz', + 'md5': 'e5aa7cc0bdc6db9b33df1a48e49a15ac', + 'note': 'v2 video', + 'info_dict': { + 'id': 'yLqjzPtbTimsn2wWBKHz', + 'ext': 'mp4', + 'title': 'Hipnose Condicionativa 4', + 'description': 'md5:ef0481abf8fb4ae6f525088a6dadbc40', + 'duration': 692, + 'uploader': 'sapozen', + 'upload_date': '20090609', + 'categories': ['condicionativa', 'heloisa', 'hipnose', 'miranda', 'sapo', 'zen'], + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + item = self._download_xml( + 'http://rd3.videos.sapo.pt/%s/rss2' % video_id, video_id).find('./channel/item') + + title = item.find('./title').text + description = item.find('./{http://videos.sapo.pt/mrss/}synopse').text + thumbnail = item.find('./{http://search.yahoo.com/mrss/}content').get('url') + duration = parse_duration(item.find('./{http://videos.sapo.pt/mrss/}time').text) + uploader = item.find('./{http://videos.sapo.pt/mrss/}author').text + upload_date = unified_strdate(item.find('./pubDate').text) + view_count = int(item.find('./{http://videos.sapo.pt/mrss/}views').text) + comment_count = int(item.find('./{http://videos.sapo.pt/mrss/}comment_count').text) + tags = item.find('./{http://videos.sapo.pt/mrss/}tags').text + categories = tags.split() if tags else [] + age_limit = 18 if item.find('./{http://videos.sapo.pt/mrss/}m18').text == 'true' else 0 + + video_url = item.find('./{http://videos.sapo.pt/mrss/}videoFile').text + video_size = item.find('./{http://videos.sapo.pt/mrss/}videoSize').text.split('x') + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'format_id': 'sd', + 'width': int(video_size[0]), + 'height': int(video_size[1]), + }] + + if item.find('./{http://videos.sapo.pt/mrss/}HD').text == 'true': + formats.append({ + 'url': re.sub(r'/mov/1$', '/mov/39', video_url), + 'ext': 'mp4', + 'format_id': 'hd', + 'width': 1280, + 'height': 720, + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': uploader, + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': age_limit, + 'formats': formats, + } From 8adec2b9e05d356a6996ea6f85aa9b4bf0665ce2 Mon Sep 17 00:00:00 2001 From: hassaanaliw <hassaanaliw@gmail.com> Date: Sat, 19 Jul 2014 22:49:25 +0500 Subject: [PATCH 24/73] [snotr] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/snotr.py | 73 ++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 youtube_dl/extractor/snotr.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 78b95c2a5f..faf4735483 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -263,6 +263,7 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) +from .snotr import SnotrIE from .sohu import SohuIE from .soundcloud import ( SoundcloudIE, diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py new file mode 100644 index 0000000000..f89e81bf32 --- /dev/null +++ b/youtube_dl/extractor/snotr.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + + str_to_int, + parse_iso8601, + + + +) + +class SnotrIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/([\w]+)' + _TESTS =[ { + 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', + 'info_dict': { + 'id': '13708', + 'ext': 'flv', + 'title': 'Drone flying through fireworks!', + 'duration': 247, + 'filesize':12320768 + } + }, + + + + { + + 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', + 'info_dict': { + 'id': '530', + 'ext': 'flv', + 'title': 'David Letteman - George W. Bush Top 10', + 'duration': 126, + 'filesize': 1048576 + } + }] + + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # TODO more code goes here, for example ... + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + + description = self._og_search_description(webpage) + + video_url = "http://cdn.videos.snotr.com/%s.flv" % video_id + + view_count = str_to_int(self._html_search_regex(r'<p>\n<strong>Views:</strong>\n([\d,\.]+)</p>',webpage,'view count')) + + duration = self._html_search_regex(r'<p>\n<strong>Length:</strong>\n(.*?)</p>',webpage,'duration') + duration = str_to_int(duration[:1])*60 + str_to_int(duration[2:4]) + + file_size = self._html_search_regex(r'<p>\n<strong>Filesize:</strong>\n(.*?)</p>',webpage,'filesize') + file_size = str_to_int(re.match(r'\d+',file_size).group())*131072 + + return { + 'id': video_id, + 'title': title, + 'url':video_url, + 'view_count':view_count, + 'duration':duration, + 'filesize':file_size + + } \ No newline at end of file From 0cb2056304178ae8944e84c5bc72f96102291a12 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 18 Jul 2014 14:20:34 +0200 Subject: [PATCH 25/73] [swfinterp] Start working on basic tests --- test/swftests/.gitignore | 1 + test/swftests/LocalVars.as | 13 +++++++ test/test_swfinterp.py | 73 ++++++++++++++++++++++++++++++++++++++ youtube_dl/swfinterp.py | 56 ++++++++++++++++++++--------- 4 files changed, 126 insertions(+), 17 deletions(-) create mode 100644 test/swftests/.gitignore create mode 100644 test/swftests/LocalVars.as create mode 100644 test/test_swfinterp.py diff --git a/test/swftests/.gitignore b/test/swftests/.gitignore new file mode 100644 index 0000000000..da97ff7caf --- /dev/null +++ b/test/swftests/.gitignore @@ -0,0 +1 @@ +*.swf diff --git a/test/swftests/LocalVars.as b/test/swftests/LocalVars.as new file mode 100644 index 0000000000..b2911a9f3e --- /dev/null +++ b/test/swftests/LocalVars.as @@ -0,0 +1,13 @@ +// input: [1, 2] +// output: 3 + +package { +public class LocalVars { + public static function main(a:int, b:int):int{ + var c:int = a + b + b; + var d:int = c - b; + var e:int = d; + return e; + } +} +} diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py new file mode 100644 index 0000000000..98a14a006e --- /dev/null +++ b/test/test_swfinterp.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +import io +import json +import re +import subprocess + +from youtube_dl.swfinterp import SWFInterpreter + + +TEST_DIR = os.path.join( + os.path.dirname(os.path.abspath(__file__)), 'swftests') + + +class TestSWFInterpreter(unittest.TestCase): + pass + + +for testfile in os.listdir(TEST_DIR): + m = re.match(r'^(.*)\.(as)$', testfile) + if not m: + continue + test_id = m.group(1) + + def test_func(self): + as_file = os.path.join(TEST_DIR, testfile) + swf_file = os.path.join(TEST_DIR, test_id + '.swf') + if ((not os.path.exists(swf_file)) + or os.path.getmtime(swf_file) < os.path.getmtime(as_file)): + # Recompile + try: + subprocess.check_call(['mxmlc', '--output', swf_file, as_file]) + except OSError as ose: + if ose.errno == errno.ENOENT: + print('mxmlc not found! Skipping test.') + return + raise + + with open(swf_file, 'rb') as swf_f: + swf_content = swf_f.read() + swfi = SWFInterpreter(swf_content) + + with io.open(as_file, 'r', encoding='utf-8') as as_f: + as_content = as_f.read() + + def _find_spec(key): + m = re.search( + r'(?m)^//\s*%s:\s*(.*?)\n' % re.escape(key), as_content) + if not m: + raise ValueError('Cannot find %s in %s' % (key, testfile)) + return json.loads(m.group(1)) + + input_args = _find_spec('input') + output = _find_spec('output') + + swf_class = swfi.extract_class(test_id) + func = swfi.extract_function(swf_class, 'main') + res = func(input_args) + self.assertEqual(res, output) + + test_func.__name__ = str('test_swf_' + test_id) + setattr(TestSWFInterpreter, test_func.__name__, test_func) + + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 1cd2921386..49fade364a 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -8,8 +8,22 @@ import zlib from .utils import ExtractorError -def _extract_tags(content): - pos = 0 +def _extract_tags(file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + 'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError( + 'Unsupported compression format %r' % + file_contents[:1]) + + # Determine number of bits in framesize rectangle + framesize_nbits = struct.unpack('!B', content[:1])[0] >> 3 + framesize_len = (5 + 4 * framesize_nbits + 7) // 8 + + pos = framesize_len + 2 + 2 while pos < len(content): header16 = struct.unpack('<H', content[pos:pos + 2])[0] pos += 2 @@ -18,7 +32,9 @@ def _extract_tags(content): if tag_len == 0x3f: tag_len = struct.unpack('<I', content[pos:pos + 4])[0] pos += 4 - assert pos + tag_len <= len(content) + assert pos + tag_len <= len(content), \ + ('Tag %d ends at %d+%d - that\'s longer than the file (%d)' + % (tag_code, pos, tag_len, len(content))) yield (tag_code, content[pos:pos + tag_len]) pos += tag_len @@ -88,8 +104,7 @@ def _read_string(reader): def _read_bytes(count, reader): - if reader is None: - reader = code_reader + assert count >= 0 resb = reader.read(count) assert len(resb) == count return resb @@ -103,18 +118,8 @@ def _read_byte(reader): class SWFInterpreter(object): def __init__(self, file_contents): - if file_contents[1:3] != b'WS': - raise ExtractorError( - 'Not an SWF file; header is %r' % file_contents[:3]) - if file_contents[:1] == b'C': - content = zlib.decompress(file_contents[8:]) - else: - raise NotImplementedError( - 'Unsupported compression format %r' % - file_contents[:1]) - code_tag = next(tag - for tag_code, tag in _extract_tags(content) + for tag_code, tag in _extract_tags(file_contents) if tag_code == 82) p = code_tag.index(b'\0', 4) + 1 code_reader = io.BytesIO(code_tag[p:]) @@ -139,7 +144,7 @@ class SWFInterpreter(object): for _c in range(1, uint_count): u32() double_count = u30() - read_bytes((double_count - 1) * 8) + read_bytes(max(0, (double_count - 1)) * 8) string_count = u30() constant_strings = [''] for _c in range(1, string_count): @@ -349,6 +354,9 @@ class SWFInterpreter(object): elif opcode == 36: # pushbyte v = _read_byte(coder) stack.append(v) + elif opcode == 42: # dup + value = stack[-1] + stack.append(value) elif opcode == 44: # pushstring idx = u30() stack.append(constant_strings[idx]) @@ -468,10 +476,24 @@ class SWFInterpreter(object): obj = stack.pop() assert isinstance(obj, list) stack.append(obj[idx]) + elif opcode == 115: # convert_ + value = stack.pop() + intvalue = int(value) + stack.append(intvalue) elif opcode == 128: # coerce u30() elif opcode == 133: # coerce_s assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 160: # add + value2 = stack.pop() + value1 = stack.pop() + res = value1 + value2 + stack.append(res) + elif opcode == 161: # subtract + value2 = stack.pop() + value1 = stack.pop() + res = value1 - value2 + stack.append(res) elif opcode == 164: # modulo value2 = stack.pop() value1 = stack.pop() From e75c24e88907f329c57cf05d729dbf599349bb50 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 00:03:54 +0200 Subject: [PATCH 26/73] [swfinterp] Extend tests and fix parsing --- test/swftests/StaticAssignment.as | 13 +++++++++ test/swftests/StaticRetrieval.as | 16 +++++++++++ test/test_swfinterp.py | 9 ++++-- youtube_dl/swfinterp.py | 47 +++++++++++++++++++++++-------- 4 files changed, 70 insertions(+), 15 deletions(-) create mode 100644 test/swftests/StaticAssignment.as create mode 100644 test/swftests/StaticRetrieval.as diff --git a/test/swftests/StaticAssignment.as b/test/swftests/StaticAssignment.as new file mode 100644 index 0000000000..b061c219d2 --- /dev/null +++ b/test/swftests/StaticAssignment.as @@ -0,0 +1,13 @@ +// input: [1] +// output: 1 + +package { +public class StaticAssignment { + public static var v:int; + + public static function main(a:int):int{ + v = a; + return v; + } +} +} diff --git a/test/swftests/StaticRetrieval.as b/test/swftests/StaticRetrieval.as new file mode 100644 index 0000000000..c8352d819d --- /dev/null +++ b/test/swftests/StaticRetrieval.as @@ -0,0 +1,16 @@ +// input: [] +// output: 1 + +package { +public class StaticRetrieval { + public static var v:int; + + public static function main():int{ + if (v) { + return 0; + } else { + return 1; + } + } +} +} diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py index 98a14a006e..3bb5a6308c 100644 --- a/test/test_swfinterp.py +++ b/test/test_swfinterp.py @@ -23,10 +23,10 @@ class TestSWFInterpreter(unittest.TestCase): pass -for testfile in os.listdir(TEST_DIR): +def _make_testfunc(testfile): m = re.match(r'^(.*)\.(as)$', testfile) if not m: - continue + return test_id = m.group(1) def test_func(self): @@ -36,7 +36,7 @@ for testfile in os.listdir(TEST_DIR): or os.path.getmtime(swf_file) < os.path.getmtime(as_file)): # Recompile try: - subprocess.check_call(['mxmlc', '--output', swf_file, as_file]) + subprocess.check_call(['mxmlc', '-output', swf_file, as_file]) except OSError as ose: if ose.errno == errno.ENOENT: print('mxmlc not found! Skipping test.') @@ -69,5 +69,8 @@ for testfile in os.listdir(TEST_DIR): setattr(TestSWFInterpreter, test_func.__name__, test_func) +for testfile in os.listdir(TEST_DIR): + _make_testfunc(testfile) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 49fade364a..64a518fc61 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -39,6 +39,16 @@ def _extract_tags(file_contents): pos += tag_len +class _AVM_Object(object): + def __init__(self, value=None, name_hint=None): + self.value = value + self.name_hint = name_hint + + def __repr__(self): + nh = '' if self.name_hint is None else (' %s' % self.name_hint) + return 'AVMObject%s(%r)' % (nh, self.value) + + class _AVMClass_Object(object): def __init__(self, avm_class): self.avm_class = avm_class @@ -92,8 +102,8 @@ def _s32(reader): def _s24(reader): bs = reader.read(3) assert len(bs) == 3 - first_byte = b'\xff' if (ord(bs[0:1]) >= 0x80) else b'\x00' - return struct.unpack('!i', first_byte + bs) + last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00' + return struct.unpack('<i', bs + last_byte)[0] def _read_string(reader): @@ -341,8 +351,9 @@ class SWFInterpreter(object): u30 = lambda: _u30(coder) print('Invoking %s.%s(%r)' % (avm_class.name, func_name, tuple(args))) - registers = ['(this)'] + list(args) + [None] * m.local_count + registers = [avm_class.variables] + list(args) + [None] * m.local_count stack = [] + scopes = collections.deque([avm_class.variables]) while True: opcode = _read_byte(coder) print('opcode: %r, stack(%d): %r' % (opcode, len(stack), stack)) @@ -351,6 +362,11 @@ class SWFInterpreter(object): value = stack.pop() if value: coder.seek(coder.tell() + offset) + elif opcode == 18: # iffalse + offset = s24() + value = stack.pop() + if not value: + coder.seek(coder.tell() + offset) elif opcode == 36: # pushbyte v = _read_byte(coder) stack.append(v) @@ -361,9 +377,8 @@ class SWFInterpreter(object): idx = u30() stack.append(constant_strings[idx]) elif opcode == 48: # pushscope - # We don't implement the scope register, so we'll just - # ignore the popped value new_scope = stack.pop() + scopes.append(new_scope) elif opcode == 70: # callproperty index = u30() mname = self.multinames[index] @@ -435,20 +450,28 @@ class SWFInterpreter(object): arr.append(stack.pop()) arr = arr[::-1] stack.append(arr) - elif opcode == 93: # findpropstrict - index = u30() - mname = self.multinames[index] - res = self.extract_function(avm_class, mname) - stack.append(res) elif opcode == 94: # findproperty index = u30() mname = self.multinames[index] - res = avm_class.variables.get(mname) + for s in reversed(scopes): + if mname in s: + res = s + break + else: + res = scopes[0] stack.append(res) elif opcode == 96: # getlex index = u30() mname = self.multinames[index] - res = avm_class.variables.get(mname, None) + for s in reversed(scopes): + if mname in s: + scope = s + break + else: + scope = scopes[0] + # I cannot find where static variables are initialized + # so let's just return None + res = scope.get(mname) stack.append(res) elif opcode == 97: # setproperty index = u30() From 70f767dc65189df0118d319d62385e54bd9bb03e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 00:25:58 +0200 Subject: [PATCH 27/73] [swfinterp] Add support for multiple classes --- test/swftests/ClassConstruction.as | 15 ++++++++++ youtube_dl/swfinterp.py | 48 ++++++++++++++++++++++++++---- 2 files changed, 57 insertions(+), 6 deletions(-) create mode 100644 test/swftests/ClassConstruction.as diff --git a/test/swftests/ClassConstruction.as b/test/swftests/ClassConstruction.as new file mode 100644 index 0000000000..436479f8fc --- /dev/null +++ b/test/swftests/ClassConstruction.as @@ -0,0 +1,15 @@ +// input: [] +// output: 0 + +package { +public class ClassConstruction { + public static function main():int{ + var f:Foo = new Foo(); + return 0; + } +} +} + +class Foo { + +} diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 64a518fc61..f7a3889a0d 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -5,7 +5,10 @@ import io import struct import zlib -from .utils import ExtractorError +from .utils import ( + compat_str, + ExtractorError, +) def _extract_tags(file_contents): @@ -65,7 +68,26 @@ class _AVMClass(object): self.method_idxs = {} self.methods = {} self.method_pyfunctions = {} - self.variables = {} + + class ScopeDict(dict): + def __init__(self, avm_class): + super(ScopeDict, self).__init__() + self.avm_class = avm_class + + def __getitem__(self, k): + print('getting %r' % k) + return super(ScopeDict, self).__getitem__(k) + + def __contains__(self, k): + print('contains %r' % k) + return super(ScopeDict, self).__contains__(k) + + def __repr__(self): + return '%s__Scope(%s)' % ( + self.avm_class.name, + super(ScopeDict, self).__repr__()) + + self.variables = ScopeDict(self) def make_object(self): return _AVMClass_Object(self) @@ -156,10 +178,10 @@ class SWFInterpreter(object): double_count = u30() read_bytes(max(0, (double_count - 1)) * 8) string_count = u30() - constant_strings = [''] + self.constant_strings = [''] for _c in range(1, string_count): s = _read_string(code_reader) - constant_strings.append(s) + self.constant_strings.append(s) namespace_count = u30() for _c in range(1, namespace_count): read_bytes(1) # kind @@ -189,7 +211,7 @@ class SWFInterpreter(object): if kind == 0x07: u30() # namespace_idx name_idx = u30() - self.multinames.append(constant_strings[name_idx]) + self.multinames.append(self.constant_strings[name_idx]) else: self.multinames.append('[MULTINAME kind: %d]' % kind) for _c2 in range(MULTINAME_SIZES[kind]): @@ -375,7 +397,7 @@ class SWFInterpreter(object): stack.append(value) elif opcode == 44: # pushstring idx = u30() - stack.append(constant_strings[idx]) + stack.append(self.constant_strings[idx]) elif opcode == 48: # pushscope new_scope = stack.pop() scopes.append(new_scope) @@ -450,6 +472,16 @@ class SWFInterpreter(object): arr.append(stack.pop()) arr = arr[::-1] stack.append(arr) + elif opcode == 93: # findpropstrict + index = u30() + mname = self.multinames[index] + for s in reversed(scopes): + if mname in s: + res = s + break + else: + res = scopes[0] + stack.append(res) elif opcode == 94: # findproperty index = u30() mname = self.multinames[index] @@ -535,6 +567,10 @@ class SWFInterpreter(object): stack.append(registers[2]) elif opcode == 211: # getlocal_3 stack.append(registers[3]) + elif opcode == 212: # setlocal_0 + registers[0] = stack.pop() + elif opcode == 213: # setlocal_1 + registers[1] = stack.pop() elif opcode == 214: # setlocal_2 registers[2] = stack.pop() elif opcode == 215: # setlocal_3 From 01b4b745749bb92b4a56b4201d699740cbf450ab Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 12:47:15 +0200 Subject: [PATCH 28/73] [swfinterp] Add support for calls to instance methods --- test/swftests/ClassCall.as | 17 ++++++ youtube_dl/swfinterp.py | 117 +++++++++++++++++++------------------ 2 files changed, 77 insertions(+), 57 deletions(-) create mode 100644 test/swftests/ClassCall.as diff --git a/test/swftests/ClassCall.as b/test/swftests/ClassCall.as new file mode 100644 index 0000000000..aef58daf37 --- /dev/null +++ b/test/swftests/ClassCall.as @@ -0,0 +1,17 @@ +// input: [] +// output: 121 + +package { +public class ClassCall { + public static function main():int{ + var f:OtherClass = new OtherClass(); + return f.func(100,20); + } +} +} + +class OtherClass { + public function func(x: int, y: int):int { + return x+y+1; + } +} diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index f7a3889a0d..8ccb64c9d4 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -42,16 +42,6 @@ def _extract_tags(file_contents): pos += tag_len -class _AVM_Object(object): - def __init__(self, value=None, name_hint=None): - self.value = value - self.name_hint = name_hint - - def __repr__(self): - nh = '' if self.name_hint is None else (' %s' % self.name_hint) - return 'AVMObject%s(%r)' % (nh, self.value) - - class _AVMClass_Object(object): def __init__(self, avm_class): self.avm_class = avm_class @@ -74,14 +64,6 @@ class _AVMClass(object): super(ScopeDict, self).__init__() self.avm_class = avm_class - def __getitem__(self, k): - print('getting %r' % k) - return super(ScopeDict, self).__getitem__(k) - - def __contains__(self, k): - print('contains %r' % k) - return super(ScopeDict, self).__contains__(k) - def __repr__(self): return '%s__Scope(%s)' % ( self.avm_class.name, @@ -92,6 +74,15 @@ class _AVMClass(object): def make_object(self): return _AVMClass_Object(self) + def __repr__(self): + return '_AVMClass(%s)' % (self.name) + + def register_methods(self, methods): + self.method_names.update(methods.items()) + self.method_idxs.update(dict( + (idx, name) + for name, idx in methods.items())) + def _read_int(reader): res = 0 @@ -290,7 +281,11 @@ class SWFInterpreter(object): classes = [] for class_id in range(class_count): name_idx = u30() - classes.append(_AVMClass(name_idx, self.multinames[name_idx])) + + cname = self.multinames[name_idx] + avm_class = _AVMClass(name_idx, cname) + classes.append(avm_class) + u30() # super_name idx flags = read_byte() if flags & 0x08 != 0: # Protected namespace is present @@ -301,7 +296,9 @@ class SWFInterpreter(object): u30() # iinit trait_count = u30() for _c2 in range(trait_count): - parse_traits_info() + trait_methods = parse_traits_info() + avm_class.register_methods(trait_methods) + assert len(classes) == class_count self._classes_by_name = dict((c.name, c) for c in classes) @@ -310,10 +307,7 @@ class SWFInterpreter(object): trait_count = u30() for _c2 in range(trait_count): trait_methods = parse_traits_info() - avm_class.method_names.update(trait_methods.items()) - avm_class.method_idxs.update(dict( - (idx, name) - for name, idx in trait_methods.items())) + avm_class.register_methods(trait_methods) # Scripts script_count = u30() @@ -358,12 +352,14 @@ class SWFInterpreter(object): raise ExtractorError('Class %r not found' % class_name) def extract_function(self, avm_class, func_name): + print('Extracting %s.%s' % (avm_class.name, func_name)) if func_name in avm_class.method_pyfunctions: return avm_class.method_pyfunctions[func_name] if func_name in self._classes_by_name: return self._classes_by_name[func_name].make_object() if func_name not in avm_class.methods: - raise ExtractorError('Cannot find function %r' % func_name) + raise ExtractorError('Cannot find function %s.%s' % ( + avm_class.name, func_name)) m = avm_class.methods[func_name] def resfunc(args): @@ -375,7 +371,8 @@ class SWFInterpreter(object): print('Invoking %s.%s(%r)' % (avm_class.name, func_name, tuple(args))) registers = [avm_class.variables] + list(args) + [None] * m.local_count stack = [] - scopes = collections.deque([avm_class.variables]) + scopes = collections.deque([ + self._classes_by_name, avm_class.variables]) while True: opcode = _read_byte(coder) print('opcode: %r, stack(%d): %r' % (opcode, len(stack), stack)) @@ -408,33 +405,38 @@ class SWFInterpreter(object): args = list(reversed( [stack.pop() for _ in range(arg_count)])) obj = stack.pop() - if mname == 'split': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, compat_str) - if args[0] == '': - res = list(obj) - else: - res = obj.split(args[0]) + + if isinstance(obj, _AVMClass_Object): + func = self.extract_function(obj.avm_class, mname) + res = func(args) stack.append(res) - elif mname == 'slice': - assert len(args) == 1 - assert isinstance(args[0], int) - assert isinstance(obj, list) - res = obj[args[0]:] - stack.append(res) - elif mname == 'join': - assert len(args) == 1 - assert isinstance(args[0], compat_str) - assert isinstance(obj, list) - res = args[0].join(obj) - stack.append(res) - elif mname in avm_class.method_pyfunctions: - stack.append(avm_class.method_pyfunctions[mname](args)) - else: - raise NotImplementedError( - 'Unsupported property %r on %r' - % (mname, obj)) + continue + elif isinstance(obj, compat_str): + if mname == 'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + if args[0] == '': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + continue + elif isinstance(obj, list): + if mname == 'slice': + assert len(args) == 1 + assert isinstance(args[0], int) + res = obj[args[0]:] + stack.append(res) + continue + elif mname == 'join': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + res = args[0].join(obj) + stack.append(res) + continue + raise NotImplementedError( + 'Unsupported property %r on %r' + % (mname, obj)) elif opcode == 72: # returnvalue res = stack.pop() return res @@ -446,11 +448,12 @@ class SWFInterpreter(object): obj = stack.pop() mname = self.multinames[index] + assert isinstance(obj, _AVMClass) construct_method = self.extract_function( - obj.avm_class, mname) + obj, mname) # We do not actually call the constructor for now; # we just pretend it does nothing - stack.append(obj) + stack.append(obj.make_object()) elif opcode == 79: # callpropvoid index = u30() mname = self.multinames[index] @@ -481,7 +484,7 @@ class SWFInterpreter(object): break else: res = scopes[0] - stack.append(res) + stack.append(res[mname]) elif opcode == 94: # findproperty index = u30() mname = self.multinames[index] @@ -490,7 +493,7 @@ class SWFInterpreter(object): res = s break else: - res = scopes[0] + res = avm_class.variables stack.append(res) elif opcode == 96: # getlex index = u30() @@ -500,7 +503,7 @@ class SWFInterpreter(object): scope = s break else: - scope = scopes[0] + scope = avm_class.variables # I cannot find where static variables are initialized # so let's just return None res = scope.get(mname) From 0d989011fffd768116d0ca81f6c067c7e0876f36 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 14:49:10 +0200 Subject: [PATCH 29/73] [swfinterp] Add support for calling methods on objects --- test/swftests/PrivateCall.as | 21 +++++++++++++++++++++ youtube_dl/swfinterp.py | 31 ++++++++++++++++++++----------- 2 files changed, 41 insertions(+), 11 deletions(-) create mode 100644 test/swftests/PrivateCall.as diff --git a/test/swftests/PrivateCall.as b/test/swftests/PrivateCall.as new file mode 100644 index 0000000000..f1c110a379 --- /dev/null +++ b/test/swftests/PrivateCall.as @@ -0,0 +1,21 @@ +// input: [] +// output: 9 + +package { +public class PrivateCall { + public static function main():int{ + var f:OtherClass = new OtherClass(); + return f.func(); + } +} +} + +class OtherClass { + private function pf():int { + return 9; + } + + public function func():int { + return this.pf(); + } +} diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 8ccb64c9d4..d043c2f992 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -50,6 +50,17 @@ class _AVMClass_Object(object): return '%s#%x' % (self.avm_class.name, id(self)) +class _ScopeDict(dict): + def __init__(self, avm_class): + super(_ScopeDict, self).__init__() + self.avm_class = avm_class + + def __repr__(self): + return '%s__Scope(%s)' % ( + self.avm_class.name, + super(_ScopeDict, self).__repr__()) + + class _AVMClass(object): def __init__(self, name_idx, name): self.name_idx = name_idx @@ -59,17 +70,7 @@ class _AVMClass(object): self.methods = {} self.method_pyfunctions = {} - class ScopeDict(dict): - def __init__(self, avm_class): - super(ScopeDict, self).__init__() - self.avm_class = avm_class - - def __repr__(self): - return '%s__Scope(%s)' % ( - self.avm_class.name, - super(ScopeDict, self).__repr__()) - - self.variables = ScopeDict(self) + self.variables = _ScopeDict(self) def make_object(self): return _AVMClass_Object(self) @@ -411,6 +412,14 @@ class SWFInterpreter(object): res = func(args) stack.append(res) continue + elif isinstance(obj, _ScopeDict): + if mname in obj.avm_class.method_names: + func = self.extract_function(obj.avm_class, mname) + res = func(args) + else: + res = obj[mname] + stack.append(res) + continue elif isinstance(obj, compat_str): if mname == 'split': assert len(args) == 1 From decf2ae400d52e98bcd073a69b24b3dbf3d38d53 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 18:28:49 +0200 Subject: [PATCH 30/73] [swfinterp] Correct array access --- test/swftests/ArrayAccess.as | 19 +++++++++++++++++++ youtube_dl/swfinterp.py | 20 +++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 test/swftests/ArrayAccess.as diff --git a/test/swftests/ArrayAccess.as b/test/swftests/ArrayAccess.as new file mode 100644 index 0000000000..e22caa3860 --- /dev/null +++ b/test/swftests/ArrayAccess.as @@ -0,0 +1,19 @@ +// input: [["a", "b", "c", "d"]] +// output: ["c", "b", "a", "d"] + +package { +public class ArrayAccess { + public static function main(ar:Array):Array { + var aa:ArrayAccess = new ArrayAccess(); + return aa.f(ar, 2); + } + + private function f(ar:Array, num:Number):Array{ + var x:String = ar[0]; + var y:String = ar[num % ar.length]; + ar[0] = y; + ar[num] = x; + return ar; + } +} +} diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index d043c2f992..812ee7e8c5 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -85,6 +85,14 @@ class _AVMClass(object): for name, idx in methods.items())) +class _Multiname(object): + def __init__(self, kind): + self.kind = kind + + def __repr__(self): + return '[MULTINAME kind: 0x%x]' % self.kind + + def _read_int(reader): res = 0 shift = 0 @@ -205,7 +213,7 @@ class SWFInterpreter(object): name_idx = u30() self.multinames.append(self.constant_strings[name_idx]) else: - self.multinames.append('[MULTINAME kind: %d]' % kind) + self.multinames.append(_Multiname(kind)) for _c2 in range(MULTINAME_SIZES[kind]): u30() @@ -399,6 +407,13 @@ class SWFInterpreter(object): elif opcode == 48: # pushscope new_scope = stack.pop() scopes.append(new_scope) + elif opcode == 66: # construct + arg_count = u30() + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + res = obj.avm_class.make_object() + stack.append(res) elif opcode == 70: # callproperty index = u30() mname = self.multinames[index] @@ -521,7 +536,10 @@ class SWFInterpreter(object): index = u30() value = stack.pop() idx = self.multinames[index] + if isinstance(idx, _Multiname): + idx = stack.pop() obj = stack.pop() + print('Setting %r.%r = %r' % (obj, idx, value)) obj[idx] = value elif opcode == 98: # getlocal index = u30() From 1b38b5be867a01808390ee320fa03c6512177a9b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 18:29:09 +0200 Subject: [PATCH 31/73] [swfinterp] Remove debugging code --- youtube_dl/swfinterp.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 812ee7e8c5..79d86152d7 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -361,7 +361,6 @@ class SWFInterpreter(object): raise ExtractorError('Class %r not found' % class_name) def extract_function(self, avm_class, func_name): - print('Extracting %s.%s' % (avm_class.name, func_name)) if func_name in avm_class.method_pyfunctions: return avm_class.method_pyfunctions[func_name] if func_name in self._classes_by_name: @@ -377,14 +376,12 @@ class SWFInterpreter(object): s24 = lambda: _s24(coder) u30 = lambda: _u30(coder) - print('Invoking %s.%s(%r)' % (avm_class.name, func_name, tuple(args))) registers = [avm_class.variables] + list(args) + [None] * m.local_count stack = [] scopes = collections.deque([ self._classes_by_name, avm_class.variables]) while True: opcode = _read_byte(coder) - print('opcode: %r, stack(%d): %r' % (opcode, len(stack), stack)) if opcode == 17: # iftrue offset = s24() value = stack.pop() @@ -539,7 +536,6 @@ class SWFInterpreter(object): if isinstance(idx, _Multiname): idx = stack.pop() obj = stack.pop() - print('Setting %r.%r = %r' % (obj, idx, value)) obj[idx] = value elif opcode == 98: # getlocal index = u30() From 7fd48d0413f1325619562a0ad0580e4a7fff34e1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 18:30:27 +0200 Subject: [PATCH 32/73] [youtube] Correct signature testcase --- test/test_youtube_signature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index e443e0be88..609e7078cf 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -49,7 +49,7 @@ _TESTS = [ u'http://s.ytimg.com/yts/swfbin/player-vfl5vIhK2/watch_as3.swf', u'swf', 86, - u'23456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!?#$%&\'()*+,-./:;<=>"' + u'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVWXY\\!"#$%&\'()*+,-./:;<=>?' ), ] From cceb5ec2370ea1de950520937361c3f9ed4fe12a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 18:47:03 +0200 Subject: [PATCH 33/73] release 2014.07.20 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4d606c3d23..0214086d04 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.15' +__version__ = '2014.07.20' From a5d524ef4676062335303ad249cc35a1a237ad07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Jul 2014 00:28:55 +0700 Subject: [PATCH 34/73] [allocine] Update tests --- youtube_dl/extractor/allocine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 34f0cd49ba..7bd7978841 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -32,7 +32,7 @@ class AllocineIE(InfoExtractor): 'id': '19540403', 'ext': 'mp4', 'title': 'Planes 2 Bande-annonce VF', - 'description': 'md5:c4b1f7bd682a91de6491ada267ec0f4d', + 'description': 'md5:eeaffe7c2d634525e21159b93acf3b1e', 'thumbnail': 're:http://.*\.jpg', }, }, { @@ -42,7 +42,7 @@ class AllocineIE(InfoExtractor): 'id': '19544709', 'ext': 'mp4', 'title': 'Dragons 2 - Bande annonce finale VF', - 'description': 'md5:e74a4dc750894bac300ece46c7036490', + 'description': 'md5:71742e3a74b0d692c7fce0dd2017a4ac', 'thumbnail': 're:http://.*\.jpg', }, }] From b8c74d606aff4a325cdcc951df99495690f40054 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 20:20:42 +0200 Subject: [PATCH 35/73] [youtube] fix display of swf player id --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 623056bd96..6e77504bf1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -834,7 +834,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: if player_url.endswith('swf'): player_version = self._search_regex( - r'-(.+)\.swf$', player_url, + r'-(.+?)(?:/watch_as3)?\.swf$', player_url, u'flash player', fatal=False) player_desc = 'flash player %s' % player_version else: From b6ea11b9675fadd6961fb46ea35bfb88df902ae5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 20:45:36 +0200 Subject: [PATCH 36/73] [youtube] Add swf signature test case (#3270) --- test/test_youtube_signature.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 609e7078cf..e8a67c4c01 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -51,6 +51,12 @@ _TESTS = [ 86, u'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVWXY\\!"#$%&\'()*+,-./:;<=>?' ), + ( + u'http://s.ytimg.com/yts/swfbin/player-vflmDyk47/watch_as3.swf', + u'swf', + u'F375F75BF2AFDAAF2666E43868D46816F83F13E81C46.3725A8218E446A0DECD33F79DC282994D6AA92C92C9', + u'9C29AA6D499282CD97F33DCED0A644E8128A5273.64C18E31F38361864D86834E6662FAADFA2FB57F' + ), ] From 2c57c7fa5a1871d750ded6610c4fd6ee55bd96a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 20 Jul 2014 21:05:02 +0200 Subject: [PATCH 37/73] [youtube] Fix extraction of age gate videos (closes #3270) Setting the correct value of the 'sts' paramater in the 'get_video_info' url gives the correct urls. Removed parameters that are not needed. --- youtube_dl/extractor/youtube.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6e77504bf1..071aa75195 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -609,14 +609,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube - data = compat_urllib_parse.urlencode({'video_id': video_id, - 'el': 'player_embedded', - 'gl': 'US', - 'hl': 'en', - 'eurl': 'https://youtube.googleapis.com/v/' + video_id, - 'asv': 3, - 'sts':'1588', - }) + data = compat_urllib_parse.urlencode({ + 'video_id': video_id, + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + 'sts':'16268', + }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_webpage = self._download_webpage(video_info_url, video_id, note=False, From 29546b345b4926e46f1259a8ea08826009baf1a9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 21:38:02 +0200 Subject: [PATCH 38/73] [ard] Add support for NDR-style videos (fixes #3281) --- youtube_dl/extractor/ard.py | 103 ++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index b36a4d46a6..30a85c8c1c 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -7,23 +7,32 @@ from .common import InfoExtractor from ..utils import ( determine_ext, ExtractorError, + qualities, ) class ARDIE(InfoExtractor): - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' + _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' - _TEST = { - 'url': 'http://www.ardmediathek.de/das-erste/guenther-jauch/edward-snowden-im-interview-held-oder-verraeter?documentId=19288786', - 'file': '19288786.mp4', - 'md5': '515bf47ce209fb3f5a61b7aad364634c', + _TESTS = [{ + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'file': '22429276.mp4', + 'md5': '469751912f1de0816a9fc9df8336476c', 'info_dict': { - 'title': 'Edward Snowden im Interview - Held oder Verräter?', - 'description': 'Edward Snowden hat alles aufs Spiel gesetzt, um die weltweite \xdcberwachung durch die Geheimdienste zu enttarnen. Nun stellt sich der ehemalige NSA-Mitarbeiter erstmals weltweit in einem TV-Interview den Fragen eines NDR-Journalisten. Die Sendung vom Sonntagabend.', - 'thumbnail': 'http://www.ardmediathek.de/ard/servlet/contentblob/19/28/87/90/19288790/bild/2250037', + 'title': 'Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?', + 'description': 'Das Erste Mediathek [ARD]: Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?, Anne Will, Über die Spionage-Affäre diskutieren Clemens Binninger, Katrin Göring-Eckardt, Georg Mascolo, Andrew B. Denison und Constanze Kurz.. Das Video zur Sendung Anne Will am Mittwoch, 16.07.2014', }, 'skip': 'Blocked outside of Germany', - } + }, { + 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916', + 'info_dict': { + 'id': '22490580', + 'ext': 'mp4', + 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)', + 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.', + }, + 'skip': 'Blocked outside of Germany', + }] def _real_extract(self, url): # determine video id from url @@ -43,40 +52,64 @@ class ARDIE(InfoExtractor): r'<h4 class="headline">(.*?)</h4>'], webpage, 'title') description = self._html_search_meta( - 'dcterms.abstract', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) + 'dcterms.abstract', webpage, 'description', default=None) + if description is None: + description = self._html_search_meta( + 'description', webpage, 'meta description') + # Thumbnail is sometimes not present. + # It is in the mobile version, but that seems to use a different URL + # structure altogether. + thumbnail = self._og_search_thumbnail(webpage, default=None) - media_info = self._download_json( - 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) - # The second element of the _mediaArray contains the standard http urls - streams = media_info['_mediaArray'][1]['_mediaStreamArray'] - if not streams: - if '"fsk"' in webpage: - raise ExtractorError('This video is only available after 20:00') + media_streams = re.findall(r'''(?x) + mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s* + "([^"]+)"''', webpage) - formats = [] + if media_streams: + QUALITIES = qualities(['lo', 'hi', 'hq']) + formats = [] + for furl in set(media_streams): + if furl.endswith('.f4m'): + fid = 'f4m' + else: + fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl) + fid = fid_m.group(1) if fid_m else None + formats.append({ + 'quality': QUALITIES(fid), + 'format_id': fid, + 'url': furl, + }) + else: # request JSON file + media_info = self._download_json( + 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) + # The second element of the _mediaArray contains the standard http urls + streams = media_info['_mediaArray'][1]['_mediaStreamArray'] + if not streams: + if '"fsk"' in webpage: + raise ExtractorError('This video is only available after 20:00') - for s in streams: - if type(s['_stream']) == list: - for index, url in enumerate(s['_stream'][::-1]): - quality = s['_quality'] + index - formats.append({ - 'quality': quality, - 'url': url, - 'format_id': '%s-%s' % (determine_ext(url), quality) + formats = [] + for s in streams: + if type(s['_stream']) == list: + for index, url in enumerate(s['_stream'][::-1]): + quality = s['_quality'] + index + formats.append({ + 'quality': quality, + 'url': url, + 'format_id': '%s-%s' % (determine_ext(url), quality) }) - continue + continue - format = { - 'quality': s['_quality'], - 'url': s['_stream'], - } + format = { + 'quality': s['_quality'], + 'url': s['_stream'], + } - format['format_id'] = '%s-%s' % ( - determine_ext(format['url']), format['quality']) + format['format_id'] = '%s-%s' % ( + determine_ext(format['url']), format['quality']) - formats.append(format) + formats.append(format) self._sort_formats(formats) From f3308e138df3458581c8824d0d3eaa46b4b1a6d1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 21:38:29 +0200 Subject: [PATCH 39/73] release 2014.07.20.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0214086d04..c680620593 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.20' +__version__ = '2014.07.20.1' From c13bf7c836e1befb28070fe393e474566a43409a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 23:20:15 +0200 Subject: [PATCH 40/73] [swfinterp] Use helper function struct_unpack for old Python 2.x releases (#3270) --- youtube_dl/swfinterp.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 79d86152d7..87ec7bcffe 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -2,12 +2,12 @@ from __future__ import unicode_literals import collections import io -import struct import zlib from .utils import ( compat_str, ExtractorError, + struct_unpack, ) @@ -23,17 +23,17 @@ def _extract_tags(file_contents): file_contents[:1]) # Determine number of bits in framesize rectangle - framesize_nbits = struct.unpack('!B', content[:1])[0] >> 3 + framesize_nbits = struct_unpack('!B', content[:1])[0] >> 3 framesize_len = (5 + 4 * framesize_nbits + 7) // 8 pos = framesize_len + 2 + 2 while pos < len(content): - header16 = struct.unpack('<H', content[pos:pos + 2])[0] + header16 = struct_unpack('<H', content[pos:pos + 2])[0] pos += 2 tag_code = header16 >> 6 tag_len = header16 & 0x3f if tag_len == 0x3f: - tag_len = struct.unpack('<I', content[pos:pos + 4])[0] + tag_len = struct_unpack('<I', content[pos:pos + 4])[0] pos += 4 assert pos + tag_len <= len(content), \ ('Tag %d ends at %d+%d - that\'s longer than the file (%d)' @@ -99,7 +99,7 @@ def _read_int(reader): for _ in range(5): buf = reader.read(1) assert len(buf) == 1 - b = struct.unpack('<B', buf)[0] + b = struct_unpack('<B', buf)[0] res = res | ((b & 0x7f) << shift) if b & 0x80 == 0: break @@ -125,7 +125,7 @@ def _s24(reader): bs = reader.read(3) assert len(bs) == 3 last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00' - return struct.unpack('<i', bs + last_byte)[0] + return struct_unpack('<i', bs + last_byte)[0] def _read_string(reader): @@ -144,7 +144,7 @@ def _read_bytes(count, reader): def _read_byte(reader): resb = _read_bytes(1, reader=reader) - res = struct.unpack('<B', resb)[0] + res = struct_unpack('<B', resb)[0] return res From 727d2930f25164bff9a066f78ff21643f6e70ec8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 23:23:01 +0200 Subject: [PATCH 41/73] release 2014.07.20.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c680620593..e2e0ee25c1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.20.1' +__version__ = '2014.07.20.2' From 72e785f36ad215d48915b9207c403c5ce54cc9da Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 23:34:20 +0200 Subject: [PATCH 42/73] [livestream] PEP8 --- youtube_dl/extractor/livestream.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 2c100d4246..1ea1bbab4d 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -28,11 +28,13 @@ class LivestreamIE(InfoExtractor): } def _extract_video_info(self, video_data): - video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url') + video_url = ( + video_data.get('progressive_url_hd') or + video_data.get('progressive_url') + ) return { 'id': compat_str(video_data['id']), 'url': video_url, - 'ext': 'mp4', 'title': video_data['caption'], 'thumbnail': video_data['thumbnail_url'], 'upload_date': video_data['updated_at'].replace('-', '')[:8], @@ -50,7 +52,8 @@ class LivestreamIE(InfoExtractor): r'window.config = ({.*?});', webpage, 'window config') info = json.loads(config_json)['event'] videos = [self._extract_video_info(video_data['data']) - for video_data in info['feed']['data'] if video_data['type'] == 'video'] + for video_data in info['feed']['data'] + if video_data['type'] == 'video'] return self.playlist_result(videos, info['id'], info['full_name']) else: og_video = self._og_search_video_url(webpage, 'player url') From 351f3738656899afa625ef3bdf640aeeacae00ff Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 23:36:21 +0200 Subject: [PATCH 43/73] [swfinterp] Fix _u32 name --- youtube_dl/swfinterp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 87ec7bcffe..a6f4ba6e0f 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -111,7 +111,7 @@ def _u30(reader): res = _read_int(reader) assert res & 0xf0000000 == 0 return res -u32 = _read_int +_u32 = _read_int def _s32(reader): From 7fbf54dc62b43884d49d1d96854dc82a38b8b42f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 23:37:10 +0200 Subject: [PATCH 44/73] [swfinterp] Remove (at the moment) dead code --- youtube_dl/swfinterp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index a6f4ba6e0f..b63c65b201 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -470,8 +470,7 @@ class SWFInterpreter(object): mname = self.multinames[index] assert isinstance(obj, _AVMClass) - construct_method = self.extract_function( - obj, mname) + # We do not actually call the constructor for now; # we just pretend it does nothing stack.append(obj.make_object()) From 246168bd72a8f031adb243e9465ef52f62fb502c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 20 Jul 2014 23:38:44 +0200 Subject: [PATCH 45/73] Remove unused imports --- youtube_dl/extractor/firedrive.py | 1 - youtube_dl/extractor/tenplay.py | 2 -- youtube_dl/extractor/youtube.py | 3 --- 3 files changed, 6 deletions(-) diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py index d26145db1c..6d73c8a4a3 100644 --- a/youtube_dl/extractor/firedrive.py +++ b/youtube_dl/extractor/firedrive.py @@ -8,7 +8,6 @@ from ..utils import ( ExtractorError, compat_urllib_parse, compat_urllib_request, - determine_ext, ) diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py index 8477840fc6..81ba169fbe 100644 --- a/youtube_dl/extractor/tenplay.py +++ b/youtube_dl/extractor/tenplay.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 071aa75195..072e711c2e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,15 +1,12 @@ # coding: utf-8 -import collections import errno import io import itertools import json import os.path import re -import struct import traceback -import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor From eef4a7a3042914e4cad6d46a90308567f012ae59 Mon Sep 17 00:00:00 2001 From: "Anthony J. Bentley" <anthony@cathet.us> Date: Sun, 20 Jul 2014 18:37:44 -0600 Subject: [PATCH 46/73] =?UTF-8?q?Fix=20typo:=20=E2=80=9Cytseach=E2=80=9D?= =?UTF-8?q?=20=E2=86=92=20=E2=80=9Cytsearch=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f97b598457..9db27f9aa3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -402,7 +402,7 @@ class GenericIE(InfoExtractor): elif default_search == 'error': raise ExtractorError( ('%r is not a valid URL. ' - 'Set --default-search "ytseach" (or run youtube-dl "ytsearch:%s" ) to search YouTube' + 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' ) % (url, url), expected=True) else: assert ':' in default_search From 9732d77ed273406afcf9ed3ccb4d109824c9c69d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:02:44 +0200 Subject: [PATCH 47/73] [snotr] PEP8 and minor fixes (#3296) --- youtube_dl/YoutubeDL.py | 4 +++ youtube_dl/extractor/common.py | 2 ++ youtube_dl/extractor/snotr.py | 55 +++++++++++++++------------------- youtube_dl/utils.py | 22 +++++++------- 4 files changed, 41 insertions(+), 42 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3dff723b81..686988fe5b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1197,6 +1197,10 @@ class YoutubeDL(object): if res: res += ', ' res += format_bytes(fdict['filesize']) + elif fdict.get('filesize_approx') is not None: + if res: + res += ', ' + res += '~' + format_bytes(fdict['filesize_approx']) return res def list_formats(self, info_dict): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e68657314e..3213abacfc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -69,6 +69,7 @@ class InfoExtractor(object): * vcodec Name of the video codec in use * container Name of the container format * filesize The number of bytes, if known in advance + * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. @@ -555,6 +556,7 @@ class InfoExtractor(object): f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, f.get('filesize') if f.get('filesize') is not None else -1, + f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, f.get('format_id'), ) formats.sort(key=_formats_key) diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py index f89e81bf32..e762ad8f6b 100644 --- a/youtube_dl/extractor/snotr.py +++ b/youtube_dl/extractor/snotr.py @@ -4,49 +4,39 @@ from __future__ import unicode_literals import re from .common import InfoExtractor - from ..utils import ( - + float_or_none, str_to_int, - parse_iso8601, - - - + parse_duration, ) + class SnotrIE(InfoExtractor): _VALID_URL = r'http?://(?:www\.)?snotr\.com/video/(?P<id>\d+)/([\w]+)' - _TESTS =[ { + _TESTS = [{ 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', 'info_dict': { 'id': '13708', 'ext': 'flv', 'title': 'Drone flying through fireworks!', 'duration': 247, - 'filesize':12320768 - } - }, - - - - { - + 'filesize_approx': 98566144, + } + }, { 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', 'info_dict': { 'id': '530', 'ext': 'flv', 'title': 'David Letteman - George W. Bush Top 10', 'duration': 126, - 'filesize': 1048576 - } - }] - + 'filesize_approx': 8912896, + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - # TODO more code goes here, for example ... webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) @@ -54,20 +44,23 @@ class SnotrIE(InfoExtractor): video_url = "http://cdn.videos.snotr.com/%s.flv" % video_id - view_count = str_to_int(self._html_search_regex(r'<p>\n<strong>Views:</strong>\n([\d,\.]+)</p>',webpage,'view count')) + view_count = str_to_int(self._html_search_regex( + r'<p>\n<strong>Views:</strong>\n([\d,\.]+)</p>', + webpage, 'view count', fatal=False)) - duration = self._html_search_regex(r'<p>\n<strong>Length:</strong>\n(.*?)</p>',webpage,'duration') - duration = str_to_int(duration[:1])*60 + str_to_int(duration[2:4]) + duration = parse_duration(self._html_search_regex( + r'<p>\n<strong>Length:</strong>\n\s*([0-9:]+).*?</p>', + webpage, 'duration', fatal=False)) - file_size = self._html_search_regex(r'<p>\n<strong>Filesize:</strong>\n(.*?)</p>',webpage,'filesize') - file_size = str_to_int(re.match(r'\d+',file_size).group())*131072 + filesize_approx = float_or_none(self._html_search_regex( + r'<p>\n<strong>Filesize:</strong>\n\s*([0-9.]+)\s*megabyte</p>', + webpage, 'filesize', fatal=False), invscale=1024 * 1024) return { 'id': video_id, 'title': title, - 'url':video_url, - 'view_count':view_count, - 'duration':duration, - 'filesize':file_size - - } \ No newline at end of file + 'url': video_url, + 'view_count': view_count, + 'duration': duration, + 'filesize_approx': filesize_approx, + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 919603c623..bf4d1112f9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1193,13 +1193,6 @@ def format_bytes(bytes): return u'%.2f%s' % (converted, suffix) -def str_to_int(int_str): - if int_str is None: - return None - int_str = re.sub(r'[,\.]', u'', int_str) - return int(int_str) - - def get_term_width(): columns = os.environ.get('COLUMNS', None) if columns: @@ -1267,15 +1260,22 @@ class HEADRequest(compat_urllib_request.Request): return "HEAD" -def int_or_none(v, scale=1, default=None, get_attr=None): +def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: v = getattr(v, get_attr, None) - return default if v is None else (int(v) // scale) + return default if v is None else (int(v) * invscale // scale) -def float_or_none(v, scale=1, default=None): - return default if v is None else (float(v) / scale) +def str_to_int(int_str): + if int_str is None: + return None + int_str = re.sub(r'[,\.]', u'', int_str) + return int(int_str) + + +def float_or_none(v, scale=1, invscale=1, default=None): + return default if v is None else (float(v) * invscale / scale) def parse_duration(s): From 54330a1c3c3d4f3c4ce520e0deeece68120c3051 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:07:26 +0200 Subject: [PATCH 48/73] [swfinterp] Fix imports --- test/test_swfinterp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py index 3bb5a6308c..b42cd74c73 100644 --- a/test/test_swfinterp.py +++ b/test/test_swfinterp.py @@ -7,6 +7,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import errno import io import json import re From da8fb85859964d9a1d21a0328eb9044e19499d9c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:08:44 +0200 Subject: [PATCH 49/73] [snotr] Add description --- youtube_dl/extractor/snotr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py index e762ad8f6b..da3b05a8dc 100644 --- a/youtube_dl/extractor/snotr.py +++ b/youtube_dl/extractor/snotr.py @@ -21,6 +21,7 @@ class SnotrIE(InfoExtractor): 'title': 'Drone flying through fireworks!', 'duration': 247, 'filesize_approx': 98566144, + 'description': 'A drone flying through Fourth of July Fireworks', } }, { 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', @@ -30,6 +31,7 @@ class SnotrIE(InfoExtractor): 'title': 'David Letteman - George W. Bush Top 10', 'duration': 126, 'filesize_approx': 8912896, + 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', } }] @@ -41,7 +43,6 @@ class SnotrIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) - video_url = "http://cdn.videos.snotr.com/%s.flv" % video_id view_count = str_to_int(self._html_search_regex( @@ -58,6 +59,7 @@ class SnotrIE(InfoExtractor): return { 'id': video_id, + 'description': description, 'title': title, 'url': video_url, 'view_count': view_count, From db964a33a1c8ec0449fe2e39cf8d5de70daaffc2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:12:50 +0200 Subject: [PATCH 50/73] Remove unused imports --- youtube_dl/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f223b75f4b..0e7b9ddaff 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -72,11 +72,9 @@ __license__ = 'Public Domain' import codecs import io -import locale import optparse import os import random -import re import shlex import sys From 9aeaf730ad712aed29d241d4e6655b8e5fee1d47 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:14:06 +0200 Subject: [PATCH 51/73] [rtve] Fix md5sum Looks like these guys reencoded the video. --- youtube_dl/extractor/rtve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 77fd08ddec..c2228b2f0f 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -17,7 +17,7 @@ class RTVEALaCartaIE(InfoExtractor): _TEST = { 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', - 'md5': '18fcd45965bdd076efdb12cd7f6d7b9e', + 'md5': '1d49b7e1ca7a7502c56a4bf1b60f1b43', 'info_dict': { 'id': '2491869', 'ext': 'mp4', From 468d19a9c15f1a3ddd5363c4a966667d777782b0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:15:23 +0200 Subject: [PATCH 52/73] [savefrom] Fix test description --- youtube_dl/extractor/savefrom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py index 198a08c1c9..ccd545971f 100644 --- a/youtube_dl/extractor/savefrom.py +++ b/youtube_dl/extractor/savefrom.py @@ -20,7 +20,7 @@ class SaveFromIE(InfoExtractor): 'upload_date': '20120816', 'uploader': 'Howcast', 'uploader_id': 'Howcast', - 'description': 'md5:4f0aac94361a12e1ce57d74f85265175', + 'description': 'md5:727900f130df3dc9a25e2721497c7910', }, 'params': { 'skip_download': True From 4f95d455edf20583abd85801c23e88fa749be237 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:17:44 +0200 Subject: [PATCH 53/73] [steam] Update test description --- youtube_dl/extractor/steam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index af689e2c20..183dcb03cc 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -53,7 +53,7 @@ class SteamIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20140329', 'title': 'FRONTIERS - Final Greenlight Trailer', - 'description': 'md5:6df4fe8dd494ae811869672b0767e025', + 'description': 'md5:dc96a773669d0ca1b36c13c1f30250d9', 'uploader': 'AAD Productions', 'uploader_id': 'AtomicAgeDogGames', } From d8624e6a80751c09a48ff6b9db1d4d85e377c437 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:25:49 +0200 Subject: [PATCH 54/73] [test_playlist] Add and use assertGreaterEqual --- test/helper.py | 7 +++++++ test/test_playlists.py | 47 +++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/test/helper.py b/test/helper.py index 230d2bd67a..84b16f770e 100644 --- a/test/helper.py +++ b/test/helper.py @@ -148,3 +148,10 @@ def assertRegexpMatches(self, text, regexp, msg=None): else: msg = note + ', ' + msg self.assertTrue(m, msg) + + +def assertGreaterEqual(self, got, expected, msg=None): + if not (got >= expected): + if msg is None: + msg = '%r not greater than or equal to %r' % (got, expected) + self.assertTrue(got >= expected, msg) diff --git a/test/test_playlists.py b/test/test_playlists.py index 1a38a667b1..4789200e9f 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import ( assertRegexpMatches, + assertGreaterEqual, expect_info_dict, FakeYDL, ) @@ -71,8 +72,8 @@ class TestPlaylists(unittest.TestCase): ie = DailymotionUserIE(dl) result = ie.extract('https://www.dailymotion.com/user/nqtv') self.assertIsPlaylist(result) + assertGreaterEqual(self, len(result['entries']), 100) self.assertEqual(result['title'], 'Rémi Gaillard') - self.assertTrue(len(result['entries']) >= 100) def test_vimeo_channel(self): dl = FakeYDL() @@ -111,7 +112,7 @@ class TestPlaylists(unittest.TestCase): ie = VineUserIE(dl) result = ie.extract('https://vine.co/Visa') self.assertIsPlaylist(result) - self.assertTrue(len(result['entries']) >= 47) + assertGreaterEqual(self, len(result['entries']), 47) def test_ustream_channel(self): dl = FakeYDL() @@ -119,7 +120,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://www.ustream.tv/channel/channeljapan') self.assertIsPlaylist(result) self.assertEqual(result['id'], '10874166') - self.assertTrue(len(result['entries']) >= 54) + assertGreaterEqual(self, len(result['entries']), 54) def test_soundcloud_set(self): dl = FakeYDL() @@ -127,7 +128,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'The Royal Concept EP') - self.assertTrue(len(result['entries']) >= 6) + assertGreaterEqual(self, len(result['entries']), 6) def test_soundcloud_user(self): dl = FakeYDL() @@ -135,7 +136,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('https://soundcloud.com/the-concept-band') self.assertIsPlaylist(result) self.assertEqual(result['id'], '9615865') - self.assertTrue(len(result['entries']) >= 12) + assertGreaterEqual(self, len(result['entries']), 12) def test_soundcloud_likes(self): dl = FakeYDL() @@ -143,7 +144,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('https://soundcloud.com/the-concept-band/likes') self.assertIsPlaylist(result) self.assertEqual(result['id'], '9615865') - self.assertTrue(len(result['entries']) >= 1) + assertGreaterEqual(self, len(result['entries']), 1) def test_soundcloud_playlist(self): dl = FakeYDL() @@ -162,7 +163,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://new.livestream.com/tedx/cityenglish') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'TEDCity2.0 (English)') - self.assertTrue(len(result['entries']) >= 4) + assertGreaterEqual(self, len(result['entries']), 4) def test_livestreamoriginal_folder(self): dl = FakeYDL() @@ -170,7 +171,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('https://www.livestream.com/newplay/folder?dirId=a07bf706-d0e4-4e75-a747-b021d84f2fd3') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'a07bf706-d0e4-4e75-a747-b021d84f2fd3') - self.assertTrue(len(result['entries']) >= 28) + assertGreaterEqual(self, len(result['entries']), 28) def test_nhl_videocenter(self): dl = FakeYDL() @@ -187,7 +188,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://bambuser.com/channel/pixelversity') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'pixelversity') - self.assertTrue(len(result['entries']) >= 60) + assertGreaterEqual(self, len(result['entries']), 60) def test_bandcamp_album(self): dl = FakeYDL() @@ -195,7 +196,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://mpallante.bandcamp.com/album/nightmare-night-ep') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'Nightmare Night EP') - self.assertTrue(len(result['entries']) >= 4) + assertGreaterEqual(self, len(result['entries']), 4) def test_smotri_community(self): dl = FakeYDL() @@ -204,7 +205,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'kommuna') self.assertEqual(result['title'], 'КПРФ') - self.assertTrue(len(result['entries']) >= 4) + assertGreaterEqual(self, len(result['entries']), 4) def test_smotri_user(self): dl = FakeYDL() @@ -213,7 +214,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'inspector') self.assertEqual(result['title'], 'Inspector') - self.assertTrue(len(result['entries']) >= 9) + assertGreaterEqual(self, len(result['entries']), 9) def test_AcademicEarthCourse(self): dl = FakeYDL() @@ -232,7 +233,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'dvoe_iz_lartsa') self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008)') - self.assertTrue(len(result['entries']) >= 24) + assertGreaterEqual(self, len(result['entries']), 24) def test_ivi_compilation_season(self): dl = FakeYDL() @@ -241,7 +242,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'dvoe_iz_lartsa/season1') self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008) 1 сезон') - self.assertTrue(len(result['entries']) >= 12) + assertGreaterEqual(self, len(result['entries']), 12) def test_imdb_list(self): dl = FakeYDL() @@ -260,7 +261,7 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], 'cryptography') self.assertEqual(result['title'], 'Journey into cryptography') self.assertEqual(result['description'], 'How have humans protected their secret messages through history? What has changed today?') - self.assertTrue(len(result['entries']) >= 3) + assertGreaterEqual(self, len(result['entries']), 3) def test_EveryonesMixtape(self): dl = FakeYDL() @@ -277,7 +278,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://rutube.ru/tags/video/1800/') self.assertIsPlaylist(result) self.assertEqual(result['id'], '1800') - self.assertTrue(len(result['entries']) >= 68) + assertGreaterEqual(self, len(result['entries']), 68) def test_rutube_person(self): dl = FakeYDL() @@ -285,7 +286,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://rutube.ru/video/person/313878/') self.assertIsPlaylist(result) self.assertEqual(result['id'], '313878') - self.assertTrue(len(result['entries']) >= 37) + assertGreaterEqual(self, len(result['entries']), 37) def test_multiple_brightcove_videos(self): # https://github.com/rg3/youtube-dl/issues/2283 @@ -322,7 +323,7 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], '10') self.assertEqual(result['title'], 'Who are the hackers?') - self.assertTrue(len(result['entries']) >= 6) + assertGreaterEqual(self, len(result['entries']), 6) def test_toypics_user(self): dl = FakeYDL() @@ -330,7 +331,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://videos.toypics.net/Mikey') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'Mikey') - self.assertTrue(len(result['entries']) >= 17) + assertGreaterEqual(self, len(result['entries']), 17) def test_xtube_user(self): dl = FakeYDL() @@ -338,7 +339,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://www.xtube.com/community/profile.php?user=greenshowers') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'greenshowers') - self.assertTrue(len(result['entries']) >= 155) + assertGreaterEqual(self, len(result['entries']), 155) def test_InstagramUser(self): dl = FakeYDL() @@ -346,7 +347,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://instagram.com/porsche') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'porsche') - self.assertTrue(len(result['entries']) >= 2) + assertGreaterEqual(self, len(result['entries']), 2) test_video = next( e for e in result['entries'] if e['id'] == '614605558512799803_462752227') @@ -385,7 +386,7 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], '152147') self.assertEqual( result['title'], 'Brace Yourself - Today\'s Weirdest News') - self.assertTrue(len(result['entries']) >= 10) + assertGreaterEqual(self, len(result['entries']), 10) def test_TeacherTubeUser(self): dl = FakeYDL() @@ -393,7 +394,7 @@ class TestPlaylists(unittest.TestCase): result = ie.extract('http://www.teachertube.com/user/profile/rbhagwati2') self.assertIsPlaylist(result) self.assertEqual(result['id'], 'rbhagwati2') - self.assertTrue(len(result['entries']) >= 179) + assertGreaterEqual(self, len(result['entries']), 179) if __name__ == '__main__': unittest.main() From 1a30deca50d6256bb833aee672d5055d72319aca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:47:01 +0200 Subject: [PATCH 55/73] [teachertube] Fix title and playlist recognition --- youtube_dl/extractor/common.py | 2 +- youtube_dl/extractor/teachertube.py | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3213abacfc..9b36e07891 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -469,7 +469,7 @@ class InfoExtractor(object): display_name = name return self._html_search_regex( r'''(?ix)<meta - (?=[^>]+(?:itemprop|name|property)=["\']%s["\']) + (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?) [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), html, display_name, fatal=fatal, **kwargs) diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 2c2113b140..46d727d1de 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -62,7 +62,7 @@ class TeacherTubeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_search_meta('title', webpage, 'title') + title = self._html_search_meta('title', webpage, 'title', fatal=True) TITLE_SUFFIX = ' - TeacherTube' if title.endswith(TITLE_SUFFIX): title = title[:-len(TITLE_SUFFIX)].strip() @@ -101,7 +101,11 @@ class TeacherTubeUserIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(user/profile|collection)/(?P<user>[0-9a-zA-Z]+)/?' - _MEDIA_RE = r'(?s)"sidebar_thumb_time">[0-9:]+</div>.+?<a href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)">' + _MEDIA_RE = r'''(?sx) + class="?sidebar_thumb_time"?>[0-9:]+</div> + \s* + <a\s+href="(https?://(?:www\.)?teachertube\.com/(?:video|audio)/[^"]+)" + ''' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -111,14 +115,12 @@ class TeacherTubeUserIE(InfoExtractor): webpage = self._download_webpage(url, user_id) urls.extend(re.findall(self._MEDIA_RE, webpage)) - pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[1:-1] + pages = re.findall(r'/ajax-user/user-videos/%s\?page=([0-9]+)' % user_id, webpage)[:-1] for p in pages: more = 'http://www.teachertube.com/ajax-user/user-videos/%s?page=%s' % (user_id, p) - webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages) + 1)) - urls.extend(re.findall(self._MEDIA_RE, webpage)) - - entries = [] - for url in urls: - entries.append(self.url_result(url, 'TeacherTube')) + webpage = self._download_webpage(more, user_id, 'Downloading page %s/%s' % (p, len(pages))) + video_urls = re.findall(self._MEDIA_RE, webpage) + urls.extend(video_urls) + entries = [self.url_result(vurl, 'TeacherTube') for vurl in urls] return self.playlist_result(entries, user_id) From 264a7044f59373291bae6a662b9173d55a4b9925 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 12:57:40 +0200 Subject: [PATCH 56/73] [dropbox] Fix test and add support for spaces in filenames --- youtube_dl/extractor/dropbox.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 41208c9769..1711f0263b 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,24 +5,26 @@ import os.path import re from .common import InfoExtractor +from ..utils import compat_urllib_parse class DropboxIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)' _TEST = { - 'url': 'https://www.dropbox.com/s/0qr9sai2veej4f8/THE_DOCTOR_GAMES.mp4', - 'md5': '8ae17c51172fb7f93bdd6a214cc8c896', + 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4', + 'md5': '8a3d905427a6951ccb9eb292f154530b', 'info_dict': { - 'id': '0qr9sai2veej4f8', + 'id': 'nelirfsxnmcfbfh', 'ext': 'mp4', - 'title': 'THE_DOCTOR_GAMES' + 'title': 'youtube-dl test video \'ä"BaW_jenozKc' } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - title = os.path.splitext(mobj.group('title'))[0] + fn = compat_urllib_parse.unquote(mobj.group('title')) + title = os.path.splitext(fn)[0] video_url = url + '?dl=1' return { From 6f5342a201e2568ce91454d96281165c39dae16e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:03:18 +0200 Subject: [PATCH 57/73] [cnet] Fix title extraction URLs are still missing --- youtube_dl/extractor/cnet.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index a94f425717..710d5009b7 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -43,7 +43,11 @@ class CNETIE(InfoExtractor): raise ExtractorError('Cannot find video data') video_id = vdata['id'] - title = vdata['headline'] + title = vdata.get('headline') + if title is None: + title = vdata.get('title') + if title is None: + raise ExtractorError('Cannot find title!') description = vdata.get('dek') thumbnail = vdata.get('image', {}).get('path') author = vdata.get('author') From 0e6ebc13d154e6b8b063dfe7e9c2dd28d427fb77 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:11:24 +0200 Subject: [PATCH 58/73] [vimeo] Update test description --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 255855558c..a3c6e83b01 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -98,7 +98,7 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): 'info_dict': { 'id': '54469442', 'ext': 'mp4', - 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software', + 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012', 'uploader': 'The BLN & Business of Software', 'uploader_id': 'theblnbusinessofsoftware', 'duration': 3610, From a850fde1d82d86ed5b75e6f7e1f2e43817946290 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:14:41 +0200 Subject: [PATCH 59/73] [funnyordie] Fix test description --- youtube_dl/extractor/funnyordie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 6e6b666600..721e5fce01 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -26,7 +26,7 @@ class FunnyOrDieIE(InfoExtractor): 'id': 'e402820827', 'ext': 'mp4', 'title': 'Please Use This Song (Jon Lajoie)', - 'description': 'md5:2ed27d364f5a805a6dba199faaf6681d', + 'description': 'Please use this to sell something. www.jonlajoie.com', 'thumbnail': 're:^http:.*\.jpg$', }, }] From caf5a8817bc53e6d799c70c71d7ca03568738620 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:16:48 +0200 Subject: [PATCH 60/73] [chilloutzone] Fix test description --- youtube_dl/extractor/chilloutzone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index 02d5ba5271..a62395d4b7 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -42,7 +42,7 @@ class ChilloutzoneIE(InfoExtractor): 'id': '85523671', 'ext': 'mp4', 'title': 'The Sunday Times - Icons', - 'description': 'md5:3e1c0dc6047498d6728dcdaad0891762', + 'description': 'md5:a5f7ff82e2f7a9ed77473fe666954e84', 'uploader': 'Us', 'uploader_id': 'usfilms', 'upload_date': '20140131' From ff1956e07b64735a37de886fa2e800dd823bb3e1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:19:41 +0200 Subject: [PATCH 61/73] [wdr] Replace test case --- youtube_dl/extractor/wdr.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index f741ba5400..ab28ef6fe4 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -55,14 +55,14 @@ class WDRIE(InfoExtractor): }, }, { - 'url': 'http://www.funkhauseuropa.de/av/audiosuepersongsoulbossanova100-audioplayer.html', - 'md5': '24e83813e832badb0a8d7d1ef9ef0691', + 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html', + 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa', 'info_dict': { - 'id': 'mdb-463528', + 'id': 'mdb-478135', 'ext': 'mp3', - 'title': 'Süpersong: Soul Bossa Nova', + 'title': 'Flavia Coelho: Amar é Amar', 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a', - 'upload_date': '20140630', + 'upload_date': '20140717', }, }, ] From df8ba0d2cf9ea0ae1fde4c9f76a12f315e88aef3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:22:14 +0200 Subject: [PATCH 62/73] [tagesschau] Remove test case See http://de.wikipedia.org/wiki/Depublizieren for the sad rationale. --- youtube_dl/extractor/tagesschau.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 25b9864add..b870474515 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -19,16 +19,6 @@ class TagesschauIE(InfoExtractor): 'description': 'md5:69da3c61275b426426d711bde96463ab', 'thumbnail': 're:^http:.*\.jpg$', }, - }, { - 'url': 'http://www.tagesschau.de/multimedia/video/video-5964.html', - 'md5': '66652566900963a3f962333579eeffcf', - 'info_dict': { - 'id': '5964', - 'ext': 'mp4', - 'title': 'Nahost-Konflikt: Israel bombadiert Ziele im Gazastreifen und Westjordanland', - 'description': 'md5:07bfc78c48eec3145ed4805299a1900a', - 'thumbnail': 're:http://.*\.jpg', - }, }] _FORMATS = { From 06c155420fda2a922a7219dd6758f42b868e6d96 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:25:59 +0200 Subject: [PATCH 63/73] [sockshare] Simplify (#3268) --- youtube_dl/extractor/sockshare.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py index cbf2d7abef..75b634bc6b 100644 --- a/youtube_dl/extractor/sockshare.py +++ b/youtube_dl/extractor/sockshare.py @@ -5,7 +5,6 @@ from ..utils import ( ExtractorError, compat_urllib_parse, compat_urllib_request, - determine_ext, ) import re @@ -34,7 +33,7 @@ class SockshareIE(InfoExtractor): webpage = self._download_webpage(url, video_id) if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError(u'Video %s does not exist' % video_id, + raise ExtractorError('Video %s does not exist' % video_id, expected=True) confirm_hash = self._html_search_regex(r'''(?x)<input\s+ @@ -54,19 +53,21 @@ class SockshareIE(InfoExtractor): req.add_header('Host', 'www.sockshare.com') req.add_header('Content-type', 'application/x-www-form-urlencoded') - webpage = self._download_webpage(req, video_id, 'Downloading video page') + webpage = self._download_webpage( + req, video_id, 'Downloading video page') - video_url = self._html_search_regex(r'<a href="([^"]*)".+class="download_file_link"', webpage, 'file url') + video_url = self._html_search_regex( + r'<a href="([^"]*)".+class="download_file_link"', + webpage, 'file url') video_url = "http://www.sockshare.com" + video_url title = self._html_search_regex(r'<h1>(.+)<strong>', webpage, 'title') - thumbnail = self._html_search_regex(r'<img\ssrc="([^"]*)".+name="bg"', - webpage, 'thumbnail') - ext = determine_ext(title) + thumbnail = self._html_search_regex( + r'<img\s+src="([^"]*)".+?name="bg"', + webpage, 'thumbnail') formats = [{ 'format_id': 'sd', 'url': video_url, - 'ext': ext, }] return { From f1f725c6a0e567283704046fc21614f4826e77fd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 13:55:47 +0200 Subject: [PATCH 64/73] [dropbox] Fix title encoding on Python 2 --- youtube_dl/extractor/dropbox.py | 4 ++-- youtube_dl/utils.py | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 1711f0263b..9f569aa932 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,7 +5,7 @@ import os.path import re from .common import InfoExtractor -from ..utils import compat_urllib_parse +from ..utils import compat_urllib_parse_unquote class DropboxIE(InfoExtractor): @@ -23,7 +23,7 @@ class DropboxIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - fn = compat_urllib_parse.unquote(mobj.group('title')) + fn = compat_urllib_parse_unquote(mobj.group('title')) title = os.path.splitext(fn)[0] video_url = url + '?dl=1' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bf4d1112f9..3ecd798d74 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -91,11 +91,9 @@ except ImportError: compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - def _unquote(string, encoding='utf-8', errors='replace'): + from urllib.parse import unquote as compat_urllib_parse_unquote +except ImportError: + def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): if string == '': return string res = string.split('%') @@ -130,6 +128,13 @@ except ImportError: # Python 2 string += pct_sequence.decode(encoding, errors) return string + +try: + from urllib.parse import parse_qs as compat_parse_qs +except ImportError: # Python 2 + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace'): qs, _coerce_result = qs, unicode @@ -149,10 +154,12 @@ except ImportError: # Python 2 continue if len(nv[1]) or keep_blank_values: name = nv[0].replace('+', ' ') - name = _unquote(name, encoding=encoding, errors=errors) + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) name = _coerce_result(name) value = nv[1].replace('+', ' ') - value = _unquote(value, encoding=encoding, errors=errors) + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) value = _coerce_result(value) r.append((name, value)) return r From 754d8a035e3e1d0f3340aef662ae0df76a0be91d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 18:06:21 +0200 Subject: [PATCH 65/73] [nbcnews] Look in all playlists for video --- youtube_dl/extractor/nbc.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index aa34665d16..70aa98aee3 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -85,11 +85,25 @@ class NBCNewsIE(InfoExtractor): flags=re.MULTILINE) bootstrap = json.loads(bootstrap_json) info = bootstrap['results'][0]['video'] - playlist_url = info['fallbackPlaylistUrl'] + '?form=MPXNBCNewsAPI' mpxid = info['mpxId'] - all_videos = self._download_json(playlist_url, title)['videos'] - # The response contains additional videos - info = next(v for v in all_videos if v['mpxId'] == mpxid) + + base_urls = [ + info['fallbackPlaylistUrl'], + info['associatedPlaylistUrl'], + ] + + for base_url in base_urls: + playlist_url = base_url + '?form=MPXNBCNewsAPI' + all_videos = self._download_json(playlist_url, title)['videos'] + + try: + info = next(v for v in all_videos if v['mpxId'] == mpxid) + break + except StopIteration: + continue + + if info is None: + raise ExtractorError('Could not find video in playlists') return { '_type': 'url', From 1e8ac8364b09259b5bfa277304f5fb31906b8801 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 21 Jul 2014 18:06:51 +0200 Subject: [PATCH 66/73] release 2014.07.21 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e2e0ee25c1..0ce4a6c107 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.20.2' +__version__ = '2014.07.21' From 9dcb8f3fc7927a6a3e6f4747f64c6f8c3900cdc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 21 Jul 2014 20:42:20 +0200 Subject: [PATCH 67/73] [br] Allow '_' in the url (fixes #3311) --- youtube_dl/extractor/br.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 993360714b..f7f2f713a5 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -12,7 +12,7 @@ from ..utils import ( class BRIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-]+/)+(?P<id>[a-z0-9\-]+)\.html' + _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-_]+/)+(?P<id>[a-z0-9\-_]+)\.html' _BASE_URL = 'http://www.br.de' _TESTS = [ From 53eb217661ee95fc14d3f801118b2d69070d0bbe Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 22 Jul 2014 04:53:06 +0200 Subject: [PATCH 68/73] Add another great example for the --extractor-descriptions output --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0e7b9ddaff..c6a5b2b5b4 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -633,7 +633,7 @@ def _real_main(argv=None): if desc is False: continue if hasattr(ie, 'SEARCH_KEY'): - _SEARCHES = (u'cute kittens', u'slithering pythons', u'falling cat', u'angry poodle', u'purple fish', u'running tortoise') + _SEARCHES = (u'cute kittens', u'slithering pythons', u'falling cat', u'angry poodle', u'purple fish', u'running tortoise', u'sleeping bunny') _COUNTS = (u'', u'5', u'10', u'all') desc += u' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) compat_print(desc) From 8904e979dfe489e37bda369a5863bceecb56d490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 22 Jul 2014 20:37:33 +0700 Subject: [PATCH 69/73] [vodlocker] Fix _VALID_URL --- youtube_dl/extractor/vodlocker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index 68c59364bc..6d3b78749e 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -10,7 +10,7 @@ from ..utils import ( class VodlockerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vodlocker.com/(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' + _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?P<id>[0-9a-zA-Z]+)(?:\..*?)?' _TESTS = [{ 'url': 'http://vodlocker.com/e8wvyzz4sl42', From e00fc35dbe7cdba20d78ccbf7a2fb471d5356529 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 22 Jul 2014 15:52:01 +0200 Subject: [PATCH 70/73] [kickstarter] Support embedded videos (Fixes #3322) --- youtube_dl/extractor/kickstarter.py | 37 +++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 961dd1aa64..56a76380ca 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class KickStarterIE(InfoExtractor): _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>[^/]*)/.*' - _TEST = { + _TESTS = [{ 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location', 'md5': 'c81addca81327ffa66c642b5d8b08cab', 'info_dict': { @@ -18,22 +18,45 @@ class KickStarterIE(InfoExtractor): 'description': 'A unique motocross documentary that examines the ' 'life and mind of one of sports most elite athletes: Josh Grant.', }, - } + }, { + 'note': 'Embedded video (not using the native kickstarter video service)', + 'url': 'https://www.kickstarter.com/projects/597507018/pebble-e-paper-watch-for-iphone-and-android/posts/659178', + 'playlist': [ + { + 'info_dict': { + 'id': '78704821', + 'ext': 'mp4', + 'uploader_id': 'pebble', + 'uploader': 'Pebble Technology', + 'title': 'Pebble iOS Notifications', + } + } + ], + }] def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'data-video-url="(.*?)"', - webpage, 'video URL') - video_title = self._html_search_regex(r'<title>(.*?)', - webpage, 'title').rpartition('— Kickstarter')[0].strip() + title = self._html_search_regex( + r'\s*(.*?)(?:\s*— Kickstarter)?\s*', + webpage, 'title') + video_url = self._search_regex( + r'data-video-url="(.*?)"', + webpage, 'video URL', default=None) + if video_url is None: # No native kickstarter, look for embedded videos + return { + '_type': 'url_transparent', + 'ie_key': 'Generic', + 'url': url, + 'title': title, + } return { 'id': video_id, 'url': video_url, - 'title': video_title, + 'title': title, 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), } From 1771ddd85db7acda1e4174ccd186acd40a881fbc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 22 Jul 2014 16:59:40 +0200 Subject: [PATCH 71/73] release 2014.07.22 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0ce4a6c107..e5fcec839a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.07.21' +__version__ = '2014.07.22' From 2871d489a91b6de1a5849243e4d827123dd564ef Mon Sep 17 00:00:00 2001 From: Jason Terk Date: Tue, 22 Jul 2014 07:56:42 -0700 Subject: [PATCH 72/73] Support Alternative cbs.com URL Format Adds support for cbs.com URLs containing "/artist" instead of "/video". E.g.: http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/ --- youtube_dl/extractor/cbs.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index ac03158536..44d23aef63 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,9 +4,9 @@ from .common import InfoExtractor class CBSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/video/(?P[^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(video|artist)/(?P[^/]+)/.*' - _TEST = { + _TESTS = [{ u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', u'file': u'4JUVEwq3wUT7.flv', u'info_dict': { @@ -18,7 +18,19 @@ class CBSIE(InfoExtractor): # rtmp download u'skip_download': True, }, - } + }, { + u'url': u'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', + u'file': u'P9gjWjelt6iP.flv', + u'info_dict': { + u'title': u'Live on Letterman - St. Vincent', + u'description': u'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', + u'duration': 3221, + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From e42a692f003eabdb1efad7b9f4b10ce97c712d32 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 22 Jul 2014 17:34:34 +0200 Subject: [PATCH 73/73] [cbs] Modernize Also add threatening skip blocks in there - access is only possible from the US. We may want to find a better geolocation restriction method for tests. --- youtube_dl/extractor/cbs.py | 42 +++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 44d23aef63..822f9a7be1 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,35 +1,41 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor class CBSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(video|artist)/(?P[^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P[^/]+)/.*' _TESTS = [{ - u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', - u'file': u'4JUVEwq3wUT7.flv', - u'info_dict': { - u'title': u'Connect Chat feat. Garth Brooks', - u'description': u'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', - u'duration': 1495, + 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + 'info_dict': { + 'id': '4JUVEwq3wUT7', + 'ext': 'flv', + 'title': 'Connect Chat feat. Garth Brooks', + 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', + 'duration': 1495, }, - u'params': { + 'params': { # rtmp download - u'skip_download': True, + 'skip_download': True, }, + '_skip': 'Blocked outside the US', }, { - u'url': u'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', - u'file': u'P9gjWjelt6iP.flv', - u'info_dict': { - u'title': u'Live on Letterman - St. Vincent', - u'description': u'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', - u'duration': 3221, + 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', + 'info_dict': { + 'id': 'P9gjWjelt6iP', + 'ext': 'flv', + 'title': 'Live on Letterman - St. Vincent', + 'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', + 'duration': 3221, }, - u'params': { + 'params': { # rtmp download - u'skip_download': True, + 'skip_download': True, }, + '_skip': 'Blocked outside the US', }] def _real_extract(self, url): @@ -38,5 +44,5 @@ class CBSIE(InfoExtractor): webpage = self._download_webpage(url, video_id) real_id = self._search_regex( r"video\.settings\.pid\s*=\s*'([^']+)';", - webpage, u'real video ID') + webpage, 'real video ID') return self.url_result(u'theplatform:%s' % real_id)