2012-03-25 03:07:37 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
2012-11-28 03:34:40 +01:00
from __future__ import absolute_import
2013-01-01 21:07:37 +01:00
import base64
2012-03-25 03:07:37 +02:00
import datetime
2013-01-27 03:01:23 +01:00
import itertools
2012-03-25 03:07:37 +02:00
import netrc
import os
import re
import socket
import time
import email . utils
2012-05-01 17:01:51 +02:00
import xml . etree . ElementTree
2012-08-08 20:04:02 +02:00
import random
import math
2013-02-26 10:39:26 +01:00
import operator
2012-03-25 03:07:37 +02:00
2012-11-28 03:34:40 +01:00
from . utils import *
2012-03-25 03:07:37 +02:00
class InfoExtractor(object):
    """Information Extractor base class.

    An information extractor is handed a URL and produces metadata about
    the video (or videos) that URL refers to: the real media URL, the
    title, the uploader, and so on.  The resulting dictionaries are handed
    to the FileDownloader, which may then download the media, print the
    metadata, or perform other actions.

    Every returned dictionary must contain:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following keys are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All values should be Unicode strings.

    Subclasses redefine _real_initialize() and _real_extract() and define
    a _VALID_URL regexp; they should normally also be registered in the
    list of extractors.  _real_extract() must return a *list* of info
    dictionaries as described above.  Broken extractors set _WORKING to
    False so users are warned and tests are skipped.
    """

    # Shared defaults; instances shadow _ready/_downloader in __init__.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by dropping the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the response handle"""
        # note=None -> default progress message; note=False -> silent.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Returns a tuple (page content as string, URL handle)"""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header, if any.
        content_type = urlh.headers.get('Content-Type', '')
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string"""
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
2013-01-12 16:10:16 +01:00
2013-01-01 20:43:43 +01:00
2012-03-25 03:07:37 +02:00
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # re.VERBOSE pattern: whitespace is insignificant outside char classes.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string (used by --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; defer them to the playlist IE.
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name}, or an (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        pairs = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in pairs)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick the requested language, then English, then whatever comes first.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print every itag with its extension and dimensions (for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Fetch the login page to pick up the GALX/dsh hidden form fields.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Extract the 11-char video id (group 2 of _VALID_URL) from url."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped slashes.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalise separators before handing to unified_strdate.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if not existing_formats:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
2012-03-25 03:07:37 +02:00
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Accept the family-filter disclaimer so filtered videos are reachable."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age: POST the filter form with filtering switched off.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*.

        ``yt-``-prefixed ids are delegated to the YouTube extractor.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: media URL (plus optional gdaKey token) embedded
            # directly in the page source.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: JSON-ish "mediaData" blob inside the flashvars value.
            mobj = re.search(r'name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # Undo JSON escaping of forward slashes.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE: the previous code called .decode('utf-8') on these regex groups.
        # The webpage is already text here (_download_webpage), so on Python 3
        # that raised AttributeError (str has no .decode); the calls were removed.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
2012-03-25 03:07:37 +02:00
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Matches e.g. http://www.dailymotion.com/video/<id>_<slug>
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a Dailymotion URL.

        The best available quality is picked from the page's flashvars blob.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The path component is "<id>_<slug>"; keep only the id and drop any
        # trailing query string.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still load.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Qualities in descending order of preference; take the first key
        # present in the flashvars blob.  The for/else fires only when no
        # quality key was found at all.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # The URL is JSON-escaped ("\/"); undo that after percent-decoding.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        # Uploader: try the regular owner span first, then the "official
        # user" markup; missing uploader is only a warning, not fatal.
        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        # Upload date: the page shows DD-MM-YYYY; convert to YYYYMMDD.
        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
2012-03-25 03:07:37 +02:00
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a Photobucket URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # Previously this used compat_urllib_request.urlopen().read(), which
        # returns bytes on Python 3 and broke the str regexes below; use the
        # shared _download_webpage helper (as the other extractors do), which
        # returns text and handles error reporting.
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE: the previous .decode('utf-8') calls on these str values were
        # removed -- they raised AttributeError on Python 3.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
2012-03-25 03:07:37 +02:00
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # Extractor is marked broken/disabled; kept for reference.
    _WORKING = False

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract a Yahoo! Video info dict.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form and
        the method recurses once (new_video=False on the second pass).

        NOTE(review): the .decode('utf-8') calls below are Python-2-era; on
        Python 3 urlopen().read() returns bytes and str regexes would not even
        match -- consistent with _WORKING = False above.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # NOTE(review): group(1) here is the "people|profile" path segment,
        # not the uploader name captured by group(2) -- looks wrong, but the
        # extractor is disabled (_WORKING = False); verify before re-enabling.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
2012-03-25 03:07:37 +02:00
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract a Vimeo info dict from the page's embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize scheme-less and direct-link URLs to a canonical page URL.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # Was a bare "except:", which also swallowed KeyboardInterrupt and
        # SystemExit; narrowed to Exception (IndexError from split, ValueError
        # from json.loads are the expected failures here).
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (page shows ISO "YYYY-MM-DDT..."; emit YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
2012-03-25 03:07:37 +02:00
2012-08-26 09:11:19 +02:00
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and return {key: group} per matchTuples.

        matchTuples is a list of (group_index, key, error_message); the error
        message is reported (and None returned) when the group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek JS to the rtmp live-stream location.

        NOTE(review): this method builds video_url but never returns it, and
        _real_extract returns None for live URLs -- live-stream extraction
        looks unfinished; confirm intent before relying on it.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): dead assignment -- value is discarded (see docstring).
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page through its two XML indirections and return
        an info dict for the hd-quality stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): .decode('utf-8') assumes a byte-string title
            # (Python-2 path via urlopen().read()); raises on Python 3 str.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)
        # Live pages (index-N.html) take the live path; everything else is +7.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
2012-08-26 09:11:19 +02:00
2012-03-25 03:07:37 +02:00
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn only outside of test mode: falling back here usually means no
        # dedicated extractor matched.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Force the HTTP method to HEAD so we never download bodies here.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers: the redirected HEAD has no body.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: signal "not a redirect" with False.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        # Resolve URL shorteners first and delegate to the proper extractor.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
2012-03-25 03:07:37 +02:00
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form ``ytsearchN:QUERY`` (N results),
    ``ytsearch:QUERY`` (one result) and ``ytsearchall:QUERY`` (up to
    _max_youtube_results), querying the GData JSON-C API page by page.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "ytsearch" from the scheme part
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the result list was computed but never returned here,
            # so "ytsearchall:" queries silently produced no videos.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the API 50 results at a time; stops early when the
        API reports fewer total items than requested.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API caps the usable result count; shrink the goal to it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
2012-03-25 03:07:37 +02:00
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles pseudo-URLs of the form ``gvsearchN:QUERY``; downloads the
    matching videos directly via self._downloader.download rather than
    returning an info list.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Parse the "gvsearchN:" prefix to decide how many results to fetch.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "gvsearch" from the scheme part
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers (deduplicated, order preserved)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next page" link: we have everything Google will give us.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
2012-03-25 03:07:37 +02:00
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Same ``yvsearchN:QUERY`` scheme as the other search IEs; downloads
    matches directly instead of returning an info list.
    """

    # Marked not working; kept for reference.
    _WORKING = False

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # Parse the "yvsearchN:" prefix to decide how many results to fetch.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "yvsearch" from the scheme part
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # dedup across pages (unlike GoogleSearchIE)
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "Next" link: last results page.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
2012-03-25 03:07:37 +02:00
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Accepts playlist/course/artist/watch URLs carrying a p=/a=/list=
    parameter as well as bare PL/EC/UU playlist ids; pages through the
    GData JSON API and returns a playlist result of video URLs ordered
    by their playlist position.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        # group(1) matches the URL form, group(2) the bare-id form.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so the final list can be sorted
            # into playlist order; entries without 'content' are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
2012-03-25 03:07:37 +02:00
2012-10-14 13:48:18 +02:00
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Downloads the first channel page as HTML, then follows subsequent
    pages through the JSON channel_ajax endpoint; returns a playlist
    result of watch URLs.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from *page*, in order of first appearance."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The "load more" widget disappears on the last page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
2012-10-14 13:48:18 +02:00
2012-03-25 03:07:37 +02:00
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Resolves a user page (or "ytuser:" shorthand) to the full list of
    uploads via the GData API and returns them as a playlist result.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Pull the username out of the URL (or the "ytuser:" shorthand).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # request consecutive windows until a short page signals the end.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the ids found on this page, deduplicating within the
            # page while preserving order of first appearance.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page holding fewer than _GDATA_PAGE_SIZE ids must be the
            # final one; no need to query further.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title=username)]
2012-03-25 03:07:37 +02:00
2012-06-06 18:16:16 +02:00
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to its numeric users_id, then pages through the
    mobile episode-list AJAX endpoint and returns a playlist result.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # NOTE(review): if the data-users-id attribute is absent, mobj is
        # None and the next line raises AttributeError instead of a clean
        # extractor error — consider adding a guard.
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title=username)]
2012-06-06 18:16:16 +02:00
2012-03-25 03:07:37 +02:00
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # NOTE(review): the .decode calls below assume Python 2 byte
        # strings; on Python 3 str has no .decode — verify compat layer.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2012-03-25 03:07:37 +02:00
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in (credentials from --username/--password or
    .netrc) during initialization, then scrapes the video page's
    embedded SWF parameter blob for the stream URLs.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        if self._downloader is None:
            return
        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: proceed without logging in.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON blob with the stream parameters sits between these two
        # literal markers in the page's inline JavaScript.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2012-11-28 02:04:46 +01:00
2012-03-25 03:07:37 +02:00
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Handles /play/ redirect pages, direct file downloads (detected via
    the Content-Type header), and the regular JSON metadata API.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player whose URL fragment carries the
        # real file id; resolve it and recurse with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # urlh was opened in the try block above; read the JSON body.
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload may or may not be wrapped in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # Must match the request header above or the URL 404s.
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2012-03-25 03:07:37 +02:00
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Builds the direct .flv media URL from the thumbnail link embedded
    in the watch page and extracts the title from the page <title>.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was self._download.report_error -- there is no
            # '_download' attribute; the downloader handle is '_downloader'.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media base URL is only exposed through the image_src link;
        # the .flv lives under the same path.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                         webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2012-03-25 03:07:37 +02:00
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Accepted URL shapes:
      * abbreviations like :thedailyshow or :colbert (newest full episode)
      * full-episode URLs
      * clip URLs like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
        or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
        or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    """

    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # format code -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # format code -> frame size (informational, for --list-formats)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is mandatory here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each available format code with its extension and size."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Abbreviation (:tds, :colbert, ...): rewrite to the
            # newest-full-episodes URL and re-parse.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The show root redirects to the newest episode; re-parse the
            # redirected URL to obtain the concrete episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # BUG FIX: hostname dots are escaped now (a bare '.' matches any char).
        mMovieParams = re.findall(r'(?:<param name="movie" value="|var url = ")(http://media\.mtvnservices\.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate.
            # Renamed local ('format' shadowed the builtin).
            video_format, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    video_format, rtmp_video_url = f, v
                    break

            # BUG FIX: 'gsp\.comedystor' -- the dot is escaped now.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum + 1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2012-03-25 03:07:37 +02:00
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist."""

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # BUG FIX: each search below used to call .group(1) without a None
        # check, crashing with AttributeError on any page-layout change.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'unable to extract description')
        description = unescapeHTML(descMatch.group(1))

        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))

        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'unable to extract config URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # playlist[1] holds the actual video entry on these pages.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2012-03-25 03:07:37 +02:00
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML for the video, then the Adobe
    f4m manifest, and derives the Seg1-Frag1 fragment URL from it.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:  # dropped unused 'as err' binding
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the first-fragment URL from the manifest location and the
        # media node id (the last two characters of the id are stripped).
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2012-03-25 03:07:37 +02:00
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (URL-encoded inside the flash vars)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        # BUG FIX: 'xvideos\.com' -- the hostname dot is escaped now
        # (a bare '.' matches any character).
        mobj = re.search(r'http://(?:img.*?\.)xvideos\.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
2012-03-25 03:07:37 +02:00
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # uploader and the slug of the track title are both in the URL
        # (dropped the unused 'simple_title' local)
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink to the API track resource.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2012-03-25 03:07:37 +02:00
2013-03-24 02:24:07 +01:00
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # uploader and the slug of the set title are both in the URL
        # (dropped the unused 'simple_title' local)
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink to the API set resource.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2012-03-25 03:07:37 +02:00
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the base64-encoded media id is in jsclassref.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # BUG FIX: split once from the right so filenames containing extra
        # dots (e.g. 'talk.part1.flv') do not raise ValueError on unpacking.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2012-03-25 03:07:37 +02:00
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                url = None
        return None

    def _print_formats(self, formats):
        """Print every format/bitrate pair with the file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader & filename from url
        # BUG FIX: regex groups of a str are already str on Python 3; the
        # old .decode('utf-8') calls raised AttributeError there.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # BUG FIX: initialize so an empty 'formats' dict cannot leave
        # format_param unbound at the return below.
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return
            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2012-03-25 03:07:37 +02:00
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video (course + video params),
    a course page (playlist of videos), and the site root (playlist of
    courses). Playlist entries are recursively fed back into extract().
    """

    # BUG FIX: dots in the hostname and the .php filenames are escaped
    # now (a bare '.' matches any character).
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                                note='Downloading course info page',
                                                errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage\.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage\.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2012-03-25 03:07:37 +02:00
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a single MTV video.

        Returns a one-element list with the info dictionary, or None
        (after reporting an error through the downloader) on failure.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        # _download_webpage already returns decoded text; the old extra
        # .decode('iso-8859-1') crashed on Python 3 (str has no .decode).
        song_name = unescapeHTML(mobj.group(1))

        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # bugfix: message used to read "unable to mtvn_uri"
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        # Dots escaped: the pattern is meant to match the literal
        # "MTVN.Player.defaultPlaylistId" JavaScript assignment.
        mobj = re.search(r'MTVN\.Player\.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2012-08-16 01:54:03 +02:00
2012-08-08 20:04:02 +02:00
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id from the current time."""
        timestamp = int(time.time() * 1000)
        return '%d%d%d' % (timestamp, random.randint(1000, 1998), random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Shuffle Youku's charset with its seeded PRNG; returns a list of chars."""
        charset = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        # Every character is unique, so popping by index matches the
        # original remove-by-value behaviour exactly.
        while charset:
            seed = (seed * 211 + 30031) % 65536
            pos = int(math.floor(seed / 65536 * len(charset)))
            shuffled.append(charset.pop(pos))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Return one info dict per video segment, or None on error."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested quality onto Youku's stream names.
            if format is None or format == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif format == 'worst':
                format, ext = 'mp4', u'mp4'
            else:
                format, ext = 'flv', u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        files_info = []
        for seg_number, seg_key in enumerate(keys):
            seg_fileid = '%s%02X%s' % (fileid[0:8], seg_number, fileid[10:])
            seg_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, seg_number, seg_fileid, seg_key)
            files_info.append({
                'id': '%s_part%02d' % (video_id, seg_number),
                'url': seg_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
2012-08-19 18:39:43 +02:00
2012-08-16 01:54:03 +02:00
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        def _grab(pattern, errnote):
            # Returns group 1 of the match, or None after reporting an error.
            found = re.search(pattern, webpage)
            if found is None:
                self._downloader.report_error(errnote)
                return None
            return found.group(1)

        video_url = _grab(self.VIDEO_URL_RE, u'unable to extract video url')
        if video_url is None:
            return
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = _grab(self.VIDEO_TITLE_RE, u'unable to extract video title')
        if video_title is None:
            return

        video_thumbnail = _grab(self.VIDEO_THUMB_RE, u'unable to extract video thumbnail')
        if video_thumbnail is None:
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2012-10-09 10:48:49 +02:00
2012-09-25 10:21:02 +02:00
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the video embedded in a Google+ post.

        Returns a one-element list with the info dictionary, or None
        (after reporting an error) on failure.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date (optional; None if not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title: use the first line of the description
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # bugfix: previously fell through and crashed on mobj.group(1)
            return

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # bugfix: previously continued and raised IndexError below
            return

        # Sort in resolution; the last entry of the sort has the highest one.
        links = sorted(mobj)
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError:  # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
2012-12-13 21:27:57 +01:00
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # bugfix: key was misspelled 'uploader_date'; the documented
            # optional field name is 'upload_date'
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
2012-12-16 09:50:41 +01:00
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Matches a bare channel page, a broadcast (/b/<id>) or a
    # chapter (/c/<id>) on justin.tv or twitch.tv.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of entries the Justin.tv API returns per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (total_entries_on_page, list_of_info_dicts); clips
        without a video_file_url are counted but skipped.
        """
        webpage = self._download_webpage(url, video_id,
                u'Downloading video info JSON',
                u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a dict instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO "YYYY-MM-DD..."; keep digits only.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: whole channel (paged), chapter, or broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: archives come back in pages of
            # _JUSTIN_PAGE_LIMIT entries each.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: 'a' keeps the matching <archive> element after break;
            # the else branch fires only when no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Richer metadata (title, thumbnail, channel) comes from the
            # newer Kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + a.find('./start_timestamp').text
            self._downloader.report_warning(u'Chapter detected, but we do not know how to calculate start position. Downloading the whole file ... (See https://github.com/rg3/youtube-dl/issues/810 )')

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast: one API call, no paging.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API until a short page signals the end
        # (only channels are actually paged).
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
2012-12-20 21:28:27 +01:00
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # bugfix: report_error did not return, so execution fell
            # through and crashed on m.group('url') below; raise instead,
            # consistent with the invalid-URL handling above.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> when the player header is missing.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # bugfix: same fall-through crash as above
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
2012-12-27 01:38:41 +01:00
2013-01-01 14:12:14 +01:00
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        # Bypass the age gate with a fixed fake birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)':\{\s*FILENAME:\"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME:\"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # Walk the three match streams in lockstep: movie entry, display
        # name, and thumbnail appear in the same order on the page.
        videos = []
        for movie, name_m, thumb_m in zip(re.finditer(urlRE, webpage),
                                          re.finditer(namesRE, webpage),
                                          re.finditer(thumbsRE, webpage)):
            clip_id = movie.group('videoID')
            clip_url = movie.group('videoURL')
            if not clip_url:
                self._downloader.report_error(u'Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail')
            })
        return [self.playlist_result(videos, gameID, game_title)]
2013-01-12 13:49:14 +01:00
2013-01-01 17:52:46 +01:00
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The CDN URL is derived directly from the numeric id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }]
2013-01-01 19:37:07 +01:00
2013-03-07 06:09:55 +01:00
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r'so\.addVariable\("file","(.*?)"\)'

        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_src = self._download_webpage(url, video_id)

        src_match = re.search(_src_url, webpage_src)
        if src_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)
            thumbnail = None

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
2013-01-12 17:58:39 +01:00
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # The show metadata lives in an inline JSON assignment to gon.show.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if m is None:
            raise ExtractorError(u'Cannot find metadata')
        try:
            data = json.loads(m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
2013-01-01 19:37:07 +01:00
2013-01-05 21:42:35 +01:00
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com.

    Collects every download variant listed on the page and selects the
    requested format ('best'/'worst'/'all'/explicit) at the end.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # The age-verification cookie is required to see the real page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional: warn and continue without it)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional as well)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Size and bitrate are encoded in the 5th path component.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s - %s - %s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: the original tested `result` here (the download-list
            # match, which is never None at this point) instead of `format`,
            # so an unavailable requested format was returned as [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
2013-02-26 10:39:26 +01:00
2013-01-05 21:42:35 +01:00
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group('videoid')
        video_title = match.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL sits in the player setup code.
        url_match = re.search(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group('url'))

        # Get the upload date
        date_match = re.search(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by', webpage)
        if date_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        upload_date = unified_strdate(date_match.group('date'))

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The page <title> doubles as the video title.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream data only appears on the embed page.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL through addVariable.
        source_match = re.search(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
2013-01-27 03:01:23 +01:00
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Walks the play/next API until the last track is reached and returns
    one info dict per song.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JSON assignment.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # The play API wants a (random) session token.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']

        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
2013-01-05 21:42:35 +01:00
2013-02-08 08:25:55 +01:00
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail live at predictable CDN paths derived from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(
            r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
2013-02-17 17:13:06 +01:00
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    Handles two URL shapes (see _VALID_URL): a single talk page, resolved
    to a direct download from download.ted.com, and a playlist page whose
    talks are returned as 'TED' url_result entries.
    """
    _VALID_URL = r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default matcher
        # (which does not pass re.VERBOSE) cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: single talk vs playlist.
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
            return [self._playlist_videos_info(url, name, playlist_id)]

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        # Matches the <li> markup of each talk entry on the playlist page.
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk is handed back to this extractor as its own entry.
        playlist_entries = []
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        # talkDetails is a javascript object carrying the numeric video id
        # and the mediaSlug needed to build the download URL.
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
        return info
2013-02-08 08:25:55 +01:00
2013-02-18 18:45:09 +01:00
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is the last (or second-to-last, with a trailing slash)
    URL path element and is resolved through the site's XML metadata
    endpoint.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: the original read an undefined name `ext` here, which
            # raised NameError whenever <format_id> was missing; fall back to
            # the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2013-03-12 01:08:54 +01:00
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Format information lives in a separate XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)

        # The last <type> entry carries the preferred variant.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2013-03-29 15:13:24 +01:00
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = match.group('video_id')

        page = self._download_webpage(url, video_id)

        # The direct media URL is embedded in the player configuration.
        file_match = re.search(r'file: "(.*?)",', page)
        if not file_match:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = file_match.group(1)

        title_match = re.search(
            r'<meta property="og:title" content="(?P<title>.*?)"', page)
        if not title_match:
            self._downloader.report_error(u'Cannot find video title')
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        desc_match = re.search(
            r'<meta property="og:description" content="(?P<desc>.*?)"', page)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', page)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
2013-04-07 15:23:48 +02:00
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        match = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = match.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
2013-04-22 21:07:49 +02:00
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on *.tumblr.com blogs."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is javascript-escaped, hence the \x22 quotes.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Typo fix: message used to read "No video founded".
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as thumbnail; guard against the posters
        # list being absent instead of crashing with AttributeError.
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        m_thumb = re.search(re_thumb, webpage)
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
2013-05-01 15:55:46 +02:00
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp track downloads.

    Only tracks offering a free download page can be extracted; the
    mp3-320 variant is selected unconditionally for now.
    """

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # Typo fix: message used to read "No free songs founded".
            self._downloader.report_error('No free songs found')
            return
        download_link = m_download.group(1)
        # Renamed from `id`: avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }
        return [track_info]
2013-05-03 20:07:35 +02:00
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Direct mp4 source embedded in a <source> tag.
        media_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = media_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
2013-03-12 01:08:54 +01:00
2013-01-01 19:37:07 +01:00
def gen_extractors():
    """Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        GenericIE,
    )
    # Instantiate in declaration order; GenericIE must stay last.
    return [ie_cls() for ie_cls in ie_classes]
2013-04-20 12:42:57 +02:00
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention at module level.
    return globals()['%sIE' % ie_name]