[extractor/common] Extract f4m and m3u8 formats, subtitles and info

This commit is contained in:
Sergey M․ 2015-08-02 01:13:21 +06:00
parent 3f125c8c70
commit a107193e4b
1 changed files with 149 additions and 51 deletions

View File

@ -18,6 +18,7 @@ from ..compat import (
compat_HTTPError, compat_HTTPError,
compat_http_client, compat_http_client,
compat_urllib_error, compat_urllib_error,
compat_urllib_parse,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
compat_urllib_request, compat_urllib_request,
compat_urlparse, compat_urlparse,
@ -37,6 +38,7 @@ from ..utils import (
RegexNotFoundError, RegexNotFoundError,
sanitize_filename, sanitize_filename,
unescapeHTML, unescapeHTML,
url_basename,
) )
@ -978,69 +980,165 @@ class InfoExtractor(object):
self._sort_formats(formats) self._sort_formats(formats)
return formats return formats
# TODO: improve extraction @staticmethod
def _extract_smil_formats(self, smil_url, video_id, fatal=True): def _xpath_ns(path, namespace=None):
smil = self._download_xml( if not namespace:
smil_url, video_id, 'Downloading SMIL file', return path
'Unable to download SMIL file', fatal=fatal) out = []
for c in path.split('/'):
if not c or c == '.':
out.append(c)
else:
out.append('{%s}%s' % (namespace, c))
return '/'.join(out)
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
smil = self._download_smil(smil_url, video_id, fatal=fatal)
if smil is False: if smil is False:
assert not fatal assert not fatal
return [] return []
base = smil.find('./head/meta').get('base') namespace = self._search_regex(
r'{([^}]+)?}smil', smil.tag, 'namespace', default=None)
return self._parse_smil_formats(
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
smil = self._download_smil(smil_url, video_id, fatal=fatal)
if smil is False:
return {}
return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True):
return self._download_xml(
smil_url, video_id, 'Downloading SMIL file',
'Unable to download SMIL file', fatal=fatal)
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
namespace = self._search_regex(
r'{([^}]+)?}smil', smil.tag, 'namespace', default=None)
formats = self._parse_smil_formats(
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
video_id = os.path.splitext(url_basename(smil_url))[0]
title = None
description = None
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
name = meta.attrib.get('name')
content = meta.attrib.get('content')
if not name or not content:
continue
if not title and name == 'title':
title = content
elif not description and name in ('description', 'abstract'):
description = content
return {
'id': video_id,
'title': title or video_id,
'description': description,
'formats': formats,
'subtitles': subtitles,
}
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
base = smil_url
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
b = meta.get('base') or meta.get('httpBase')
if b:
base = b
break
formats = [] formats = []
rtmp_count = 0 rtmp_count = 0
if smil.findall('./body/seq/video'): http_count = 0
video = smil.findall('./body/seq/video')[0]
fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
formats.extend(fmts)
else:
for video in smil.findall('./body/switch/video'):
fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
formats.extend(fmts)
self._sort_formats(formats) videos = smil.findall(self._xpath_ns('.//video', namespace))
for video in videos:
return formats
def _parse_smil_video(self, video, video_id, base, rtmp_count):
src = video.get('src') src = video.get('src')
if not src: if not src:
return [], rtmp_count continue
bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
filesize = int_or_none(video.get('size') or video.get('fileSize'))
width = int_or_none(video.get('width')) width = int_or_none(video.get('width'))
height = int_or_none(video.get('height')) height = int_or_none(video.get('height'))
proto = video.get('proto') proto = video.get('proto')
if not proto:
if base:
if base.startswith('rtmp'):
proto = 'rtmp'
elif base.startswith('http'):
proto = 'http'
ext = video.get('ext') ext = video.get('ext')
if proto == 'm3u8': src_ext = determine_ext(src)
return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
elif proto == 'rtmp':
rtmp_count += 1
streamer = video.get('streamer') or base streamer = video.get('streamer') or base
return ([{
if proto == 'rtmp' or streamer.startswith('rtmp'):
rtmp_count += 1
formats.append({
'url': streamer, 'url': streamer,
'play_path': src, 'play_path': src,
'ext': 'flv', 'ext': 'flv',
'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
'tbr': bitrate, 'tbr': bitrate,
'filesize': filesize,
'width': width, 'width': width,
'height': height, 'height': height,
}], rtmp_count) })
elif proto.startswith('http'): continue
return ([{
'url': base + src, src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
'ext': ext or 'flv',
if proto == 'm3u8' or src_ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
src_url, video_id, ext or 'mp4', m3u8_id='hls'))
continue
if src_ext == 'f4m':
f4m_url = src_url
if not f4m_params:
f4m_params = {
'hdcore': '3.2.0',
'plugin': 'flowplayer-3.2.0.1',
}
f4m_url += '&' if '?' in f4m_url else '?'
f4m_url += compat_urllib_parse.urlencode(f4m_params).encode('utf-8')
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
continue
if src_url.startswith('http'):
http_count += 1
formats.append({
'url': src_url,
'ext': ext or src_ext or 'flv',
'format_id': 'http-%d' % (bitrate or http_count),
'tbr': bitrate, 'tbr': bitrate,
'filesize': filesize,
'width': width, 'width': width,
'height': height, 'height': height,
}], rtmp_count) })
continue
self._sort_formats(formats)
return formats
def _parse_smil_subtitles(self, smil, namespace=None):
subtitles = {}
for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
src = textstream.get('src')
if not src:
continue
ext = textstream.get('ext') or determine_ext(src)
if not ext:
type_ = textstream.get('type')
if type_ == 'text/srt':
ext = 'srt'
lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
subtitles.setdefault(lang, []).append({
'url': src,
'ext': ext,
})
return subtitles
def _live_title(self, name): def _live_title(self, name):
""" Generate the title for a live video """ """ Generate the title for a live video """