From 4fc946b546c2a471774646f7da291105f8a0cb99 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister
Date: Thu, 20 Feb 2014 13:14:05 +0100
Subject: [PATCH] [generic] Add support for RSS feeds (Fixes #667)

---
 test/test_playlists.py          |  9 +++++++++
 youtube_dl/extractor/generic.py | 44 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/test/test_playlists.py b/test/test_playlists.py
index 1de9e8ec1a..25bec9f1c6 100644
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -250,5 +250,14 @@ class TestPlaylists(unittest.TestCase):
         self.assertEqual(result['title'], 'python language')
         self.assertTrue(len(result['entries']) == 15)
 
+    def test_generic_rss_feed(self):
+        dl = FakeYDL()
+        ie = GenericIE(dl)
+        result = ie.extract('http://www.escapistmagazine.com/rss/videos/list/1.xml')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], 'http://www.escapistmagazine.com/rss/videos/list/1.xml')
+        self.assertEqual(result['title'], 'Zero Punctuation')
+        self.assertTrue(len(result['entries']) > 10)
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 5bcc78bf79..30160d59d4 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 
 import os
 import re
+import xml.etree.ElementTree
 
 from .common import InfoExtractor
 from .youtube import YoutubeIE
@@ -159,6 +160,38 @@ class GenericIE(InfoExtractor):
             raise ExtractorError('Invalid URL protocol')
         return response
 
+    def _extract_rss(self, url, video_id, doc):
+        """Turn a parsed RSS document into a playlist result.
+
+        url is used as the playlist id; doc is the root <rss> element.
+        video_id is not used.
+        Feeds are third-party input, so a missing element yields None (or a
+        skipped item) instead of an AttributeError.
+        """
+        def text_of(parent, path):
+            el = parent.find(path)
+            return None if el is None else el.text
+
+        entries = []
+        for item in doc.findall('./channel/item'):
+            link = text_of(item, 'link')
+            if not link:
+                # Without a <link> there is nothing to hand to an extractor
+                continue
+            entries.append({
+                '_type': 'url',
+                'url': link,
+                'title': text_of(item, 'title'),
+            })
+
+        return {
+            '_type': 'playlist',
+            'id': url,
+            'title': text_of(doc, './channel/title'),
+            'description': text_of(doc, './channel/description'),
+            'entries': entries,
+        }
+
     def _real_extract(self, url):
         parsed_url = compat_urlparse.urlparse(url)
         if not parsed_url.scheme:
@@ -219,6 +252,17 @@ class GenericIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
+        # Is it an RSS feed?
+        try:
+            # Parse bytes, not text: on Python 2 text input is implicitly
+            # encoded as ASCII, so a non-ASCII page would raise
+            # UnicodeEncodeError, which the handler below does not catch.
+            doc = xml.etree.ElementTree.fromstring(webpage.encode('utf-8'))
+            if doc.tag == 'rss':
+                return self._extract_rss(url, video_id, doc)
+        except xml.etree.ElementTree.ParseError:
+            pass
+
         # it's tempting to parse this further, but you would
         # have to take into account all the variations like
         # Video Title - Site Name