[googledrive] Fix extraction on Python 3.6

Since Python 3.6, invalid escape sequences are deprecated. It's likely
that there are invalid escape sequences somewhere on the webpage, so
instead of unescaping the whole webpage, just unescape the URL.

See https://bugs.python.org/issue27364. That change was intended for
string literals, but it also affects the 'unicode_escape' codec.
The code path is:

str.decode('unicode_escape')
    codecs.unicode_escape_decode()
        PyUnicode_DecodeUnicodeEscape()
This commit is contained in:
Yen Chi Hsuan 2017-02-05 21:41:08 +08:00
parent 6ef3e65a7b
commit e4e50f60b1
No known key found for this signature in database
GPG Key ID: 7F902A182457CA23
2 changed files with 11 additions and 4 deletions

View File

@ -1,3 +1,9 @@
version <unreleased>
Extractors
* [googledrive] Fix extraction on Python 3.6
version 2017.02.04.1
Extractors

View File

@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
lowercase_escape,
)
@ -13,12 +14,12 @@ class GoogleDriveIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
_TESTS = [{
'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
'md5': '881f7700aec4f538571fa1e0eed4a7b6',
'md5': 'd109872761f7e7ecf353fa108c0dbe1e',
'info_dict': {
'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
'ext': 'mp4',
'title': 'Big Buck Bunny.mp4',
'duration': 46,
'duration': 45,
}
}, {
# video id is longer than 28 characters
@ -55,7 +56,7 @@ class GoogleDriveIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape')
'http://docs.google.com/file/d/%s' % video_id, video_id)
reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
if reason:
@ -74,7 +75,7 @@ class GoogleDriveIE(InfoExtractor):
resolution = fmt.split('/')[1]
width, height = resolution.split('x')
formats.append({
'url': fmt_url,
'url': lowercase_escape(fmt_url),
'format_id': fmt_id,
'resolution': resolution,
'width': int_or_none(width),