[EsriVideo] Add new extractor

Add extractor for [videos.esri.com](https://videos.esri.com), a collection of videos relating to GIS.
2015-04-18 00:37:04 -04:00 · 2015-04-18 00:37:04 -04:00 · 8b8c1093b6
parent d0d6c097fc
commit 8b8c1093b6
2 changed files with 91 additions and 0 deletions
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -695,6 +695,7 @@ from .vice import ViceIE
 from .viddler import ViddlerIE
 from .videobam import VideoBamIE
 from .videodetective import VideoDetectiveIE
 from .videoesri import VideoEsriIE
 from .videolecturesnet import VideoLecturesNetIE
 from .videofyme import VideofyMeIE
 from .videomega import VideoMegaIE
--- a/youtube_dl/extractor/videoesri.py
+++ b/youtube_dl/extractor/videoesri.py
@ -0,0 +1,90 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import os
 import re
 from .common import InfoExtractor
 from ..utils import (
    unified_strdate
 )
 class VideoEsriIE(InfoExtractor):
    _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'https://video.esri.com/watch/4228',
        'md5': '170b4d513c2466ed483c150a48384133',
        'info_dict': {
            'id': '4228',
            'ext': 'mp4',
            'title': 'AppStudio for ArcGIS',
            'thumbnail': 're:^https?://.*\.jpg$',
            'upload_date': '20150310',
        }
    }
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
        upload_date_raw = self._search_regex(
            r'http-equiv="last-modified" content="(.*)"',
            webpage, 'upload date')
        upload_date = unified_strdate(upload_date_raw)
        settings_info = self._search_regex(
            r'evPlayerSettings = {(.*?);\s*$',
            webpage, 'settings info', flags=re.MULTILINE | re.DOTALL)
        # thumbnail includes '_x' for large, also has {_m,_t,_s} or
        # without size suffix returns full image
        thumbnail_path = re.findall(
            r'image\': \'(\/thumbs.*)\'',
            settings_info)[0]
        if thumbnail_path:
            thumbnail = '/'.join(['http://video.esri.com', thumbnail_path])
        # note that this misses the (exceedly rare) webm files
        video_paths = re.findall(r'mp4:(.*)\'', settings_info)
        # find possible http servers of the mp4 files (also has rtsp)
        base_url = re.findall(
            r'netstreambasepath\':\s\'(h.*)\'', settings_info)[0]
        # these are the numbers used internally, but really map
        # to other resolutions, e.g. 960 is 720p.
        heights = [480, 720, 960]
        videos_by_res = {}
        for video_path in video_paths:
            url = "{base_url}{video_path}".format(
                base_url=base_url,
                video_path=video_path)
            filename, ext = os.path.splitext(video_path)
            height_label = int(filename.split('_')[1])
            videos_by_res[height_label] = {
                'url': url,
                'ext': ext[1:],
                'protocol': 'http',  # http-only supported currently
            }
        formats = []
        for height in heights:
            if height in videos_by_res:
                formats.append(videos_by_res[height])
        result = {
            'id': video_id,
            'title': title,
            'upload_date': upload_date,
            'formats': formats,
        }
        if thumbnail:
            result['thumbnail'] = thumbnail
        return result