From 2cf8003638ef76a0f76541229ecab1adf739a3ae Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Thu, 19 Nov 2020 17:29:30 +0100
Subject: [PATCH] [amara] improve extraction

---
Reviewer notes (ignored by git-am, not part of the commit message):
- Subtitles are now collected in an explicit loop: languages that are
  unpublished or lack a subtitles_uri are skipped, and each format URL
  is built with update_url_query() instead of a string replace.
- The result is only marked as url_transparent (with an explicit ie_key)
  when the media URL is handled by YoutubeIE or VimeoIE; any other URL
  is returned as a plain result carrying the Amara video id.
- duration and timestamp are now taken from the API response.

 youtube_dl/extractor/amara.py | 143 ++++++++++++++++++++--------------
 1 file changed, 85 insertions(+), 58 deletions(-)

diff --git a/youtube_dl/extractor/amara.py b/youtube_dl/extractor/amara.py
index b222154bd..61d469574 100644
--- a/youtube_dl/extractor/amara.py
+++ b/youtube_dl/extractor/amara.py
@@ -1,76 +1,103 @@
 # coding: utf-8
 from __future__ import unicode_literals
+
 from .common import InfoExtractor
+from .youtube import YoutubeIE
+from .vimeo import VimeoIE
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    update_url_query,
+)
 
 
 class AmaraIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)'
-    _TESTS = [
-        {
-            'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
-            'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
-            'info_dict': {
-                'id': 'h6ZuVdvYnfE',
-                'ext': 'mp4',
-                'title': 'Why jury trials are becoming less common',
-                'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'subtitles': dict,
-                'upload_date': '20160813',
-                'uploader': 'PBS NewsHour',
-                'uploader_id': 'PBSNewsHour'
-            }
-        },
-        {
-            'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
-            'md5': '99392c75fa05d432a8f11df03612195e',
-            'info_dict': {
-                'id': '18622084',
-                'ext': 'mov',
-                'title': 'Vimeo at CES 2011!',
-                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'subtitles': dict,
-                'timestamp': 1294649110,
-                'upload_date': '20110110',
-                'uploader': 'Sam Morrill',
-                'uploader_id': 'sammorrill'
-            }
-        },
-        {
-            'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
-            'md5': 'd3970f08512738ee60c5807311ff5d3f',
-            'info_dict': {
-                'id': 'ChimamandaAdichie_2009G-transcript',
-                'ext': 'mp4',
-                'title': 'The danger of a single story',
-                'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'subtitles': dict,
-                'upload_date': '20131206'
-            }
+    _TESTS = [{
+        # Youtube
+        'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
+        'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
+        'info_dict': {
+            'id': 'h6ZuVdvYnfE',
+            'ext': 'mp4',
+            'title': 'Why jury trials are becoming less common',
+            'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': dict,
+            'upload_date': '20160813',
+            'uploader': 'PBS NewsHour',
+            'uploader_id': 'PBSNewsHour',
+            'timestamp': 1549639570,
         }
-    ]
-
-    def get_subtitles_for_language(self, language):
-        return [{
-            'ext': type,
-            'url': language['subtitles_uri'].replace('format=json', 'format=' + type)
-        } for type in ['vtt', 'srt', 'json']]
+    }, {
+        # Vimeo
+        'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
+        'md5': '99392c75fa05d432a8f11df03612195e',
+        'info_dict': {
+            'id': '18622084',
+            'ext': 'mov',
+            'title': 'Vimeo at CES 2011!',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': dict,
+            'timestamp': 1294763658,
+            'upload_date': '20110111',
+            'uploader': 'Sam Morrill',
+            'uploader_id': 'sammorrill'
+        }
+    }, {
+        # Direct Link
+        'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
+        'md5': 'd3970f08512738ee60c5807311ff5d3f',
+        'info_dict': {
+            'id': 's8KL7I3jLmh6',
+            'ext': 'mp4',
+            'title': 'The danger of a single story',
+            'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': dict,
+            'upload_date': '20091007',
+            'timestamp': 1254942511,
+        }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        meta = self._download_json('https://amara.org/api/videos/%s/' % video_id, video_id, query={'format': 'json'})
+        meta = self._download_json(
+            'https://amara.org/api/videos/%s/' % video_id,
+            video_id, query={'format': 'json'})
+        title = meta['title']
+        video_url = meta['all_urls'][0]
 
-        video_url = meta.get('all_urls')[0]
-        subtitles = dict([(language['code'], self.get_subtitles_for_language(language)) for language in meta.get('languages', []) if language['published']])
+        subtitles = {}
+        for language in (meta.get('languages') or []):
+            subtitles_uri = language.get('subtitles_uri')
+            if not (subtitles_uri and language.get('published')):
+                continue
+            subtitle = subtitles.setdefault(language.get('code') or 'en', [])
+            for f in ('json', 'srt', 'vtt'):
+                subtitle.append({
+                    'ext': f,
+                    'url': update_url_query(subtitles_uri, {'format': f}),
+                })
 
-        return {
-            '_type': 'url_transparent',
+        info = {
             'url': video_url,
             'id': video_id,
             'subtitles': subtitles,
-            'title': meta['title'],
+            'title': title,
             'description': meta.get('description'),
-            'thumbnail': meta.get('thumbnail')
+            'thumbnail': meta.get('thumbnail'),
+            'duration': int_or_none(meta.get('duration')),
+            'timestamp': parse_iso8601(meta.get('created')),
         }
+
+        for ie in (YoutubeIE, VimeoIE):
+            if ie.suitable(video_url):
+                info.update({
+                    '_type': 'url_transparent',
+                    'ie_key': ie.ie_key(),
+                })
+                break
+
+        return info