Add proper support for "gzip" and "deflate" encodings

This commit is contained in:
Ricardo Garcia 2011-01-12 20:20:37 +01:00
parent aac3fe0f4a
commit 1987c2325a
1 changed files with 72 additions and 17 deletions

View File

@ -8,6 +8,7 @@
import cookielib
import ctypes
import datetime
import gzip
import htmlentitydefs
import httplib
import locale
@ -18,11 +19,13 @@ import os.path
import re
import socket
import string
import StringIO
import subprocess
import sys
import time
import urllib
import urllib2
import zlib
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
@ -161,6 +164,56 @@ class ContentTooShortError(Exception):
self.downloaded = downloaded
self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """HTTP handler that sets the default headers and unpacks compressed bodies.

    Once registered with an OpenerDirector, every HTTP request passing
    through it receives the entries of std_headers, and every response
    whose Content-Encoding is "gzip" or "deflate" is replaced by an
    equivalent response carrying the decompressed body.

    Code that needs an uncompressed transfer only has to put the
    "Youtubedl-No-Compression" header on its request: the handler then
    drops the Accept-Encoding header and strips the marker header itself
    before the real request is made.

    Part of this code was copied from:

      http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Return data decompressed as a "deflate" stream.

        A raw stream (no zlib header) is attempted first; if zlib rejects
        it, the data is decompressed again as a regular zlib stream.
        """
        try:
            inflated = zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            inflated = zlib.decompress(data)
        return inflated

    def http_request(self, req):
        """Apply std_headers to req and honour the no-compression marker."""
        # The standard headers always replace whatever the caller supplied
        # under the same name.
        for name in std_headers:
            req.headers.pop(name, None)
            req.add_header(name, std_headers[name])

        marker = 'Youtubedl-no-compression'
        if marker not in req.headers:
            return req

        # The caller asked for an uncompressed transfer: stop advertising
        # compression support and remove the internal marker header.
        req.headers.pop('Accept-encoding', None)
        del req.headers[marker]
        return req

    def http_response(self, req, resp):
        """Return resp, wrapped so that reading it yields decompressed data.

        Responses with any other Content-Encoding are returned untouched.
        """
        encoding = resp.headers.get('Content-encoding', '')
        if encoding == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
        elif encoding == 'deflate':
            body = StringIO.StringIO(self.deflate(resp.read()))
        else:
            return resp

        # Rebuild the response around the decoded body, keeping the original
        # headers, URL, status code and message.
        decoded = urllib2.addinfourl(body, resp.headers, resp.url, resp.code)
        decoded.msg = resp.msg
        return decoded
class FileDownloader(object):
"""File Downloader class.
@ -559,8 +612,11 @@ class FileDownloader(object):
tmpfilename = self.temp_name(filename)
stream = None
open_mode = 'wb'
basic_request = urllib2.Request(url, None, std_headers)
request = urllib2.Request(url, None, std_headers)
# Do not include the Accept-Encoding header
headers = {'Youtubedl-no-compression': 'True'}
basic_request = urllib2.Request(url, None, headers)
request = urllib2.Request(url, None, headers)
# Establish possible resume length
if os.path.isfile(tmpfilename):
@ -822,7 +878,7 @@ class YoutubeIE(InfoExtractor):
return
# Set language
request = urllib2.Request(self._LANG_URL, None, std_headers)
request = urllib2.Request(self._LANG_URL)
try:
self.report_lang()
urllib2.urlopen(request).read()
@ -842,7 +898,7 @@ class YoutubeIE(InfoExtractor):
'username': username,
'password': password,
}
request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
try:
self.report_login()
login_results = urllib2.urlopen(request).read()
@ -858,7 +914,7 @@ class YoutubeIE(InfoExtractor):
'next_url': '/',
'action_confirm': 'Confirm',
}
request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
try:
self.report_age_confirmation()
age_results = urllib2.urlopen(request).read()
@ -876,7 +932,7 @@ class YoutubeIE(InfoExtractor):
# Get video webpage
self.report_video_webpage_download(video_id)
request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id, None, std_headers)
request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
try:
video_webpage = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -895,7 +951,7 @@ class YoutubeIE(InfoExtractor):
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type))
request = urllib2.Request(video_info_url, None, std_headers)
request = urllib2.Request(video_info_url)
try:
video_info_webpage = urllib2.urlopen(request).read()
video_info = parse_qs(video_info_webpage)
@ -1055,7 +1111,7 @@ class MetacafeIE(InfoExtractor):
def _real_initialize(self):
# Retrieve disclaimer
request = urllib2.Request(self._DISCLAIMER, None, std_headers)
request = urllib2.Request(self._DISCLAIMER)
try:
self.report_disclaimer()
disclaimer = urllib2.urlopen(request).read()
@ -1068,7 +1124,7 @@ class MetacafeIE(InfoExtractor):
'filters': '0',
'submit': "Continue - I'm over 18",
}
request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
try:
self.report_age_confirmation()
disclaimer = urllib2.urlopen(request).read()
@ -1771,7 +1827,7 @@ class YoutubeSearchIE(InfoExtractor):
while True:
self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers)
request = urllib2.Request(result_url)
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -1862,7 +1918,7 @@ class GoogleSearchIE(InfoExtractor):
while True:
self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers)
request = urllib2.Request(result_url)
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -1953,7 +2009,7 @@ class YahooSearchIE(InfoExtractor):
while True:
self.report_download_page(query, pagenum)
result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
request = urllib2.Request(result_url, None, std_headers)
request = urllib2.Request(result_url)
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -2017,7 +2073,7 @@ class YoutubePlaylistIE(InfoExtractor):
while True:
self.report_download_page(playlist_id, pagenum)
request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -2079,7 +2135,7 @@ class YoutubeUserIE(InfoExtractor):
pagenum = 1
self.report_download_page(username)
request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
request = urllib2.Request(self._TEMPLATE_URL % (username))
try:
page = urllib2.urlopen(request).read()
except (urllib2.URLError, httplib.HTTPException, socket.error), err:
@ -2135,7 +2191,7 @@ class DepositFilesIE(InfoExtractor):
# Retrieve file webpage with 'Free download' button pressed
free_download_indication = { 'gateway_result' : '1' }
request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
request = urllib2.Request(url, urllib.urlencode(free_download_indication))
try:
self.report_download_webpage(file_id)
webpage = urllib2.urlopen(request).read()
@ -2354,8 +2410,7 @@ if __name__ == '__main__':
# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
urllib2.install_opener(urllib2.build_opener(cookie_processor))
urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# Batch file verification