From 575dad3c9842f333c4af27563a26bddaf0015fa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roman=20Le=20N=C3=A9grate?= Date: Sun, 22 Mar 2015 20:25:44 +0100 Subject: [PATCH 1/5] [pornovoisines] Add extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/pornovoisines.py | 101 ++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 youtube_dl/extractor/pornovoisines.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d73826d44b..17d075ec88 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -382,6 +382,7 @@ from .pornhub import ( PornHubPlaylistIE, ) from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py new file mode 100644 index 0000000000..efbb6a8187 --- /dev/null +++ b/youtube_dl/extractor/pornovoisines.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import datetime +import random + +from ..compat import compat_urllib_parse +from .common import InfoExtractor + +class PornoVoisinesIE(InfoExtractor): + _VALID_URL = r'^((?:http://)?(?:www\.)?pornovoisines.com)/showvideo/(\d+)/([^/]+)' + + VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ + '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' + + SERVER_NUMBERS = (1, 2) + + _TEST = { + 'url': 'http://www.pornovoisines.com/showvideo/1285/recherche-appartement/', + 'md5': '5ac670803bc12e9e7f9f662ce64cf1d1', + 'info_dict': { + 'id': '1285', + 'display_id': 'recherche-appartement', + 'ext': 'mp4', + 'title': "Recherche appartement", + 'upload_date': '20140925', + 'view_count': int, + 'duration': 120, + 'categories': ["Débutante", "Scénario", "Sodomie"], + 'description': 're:^Pour la .+ original...$', + 'thumbnail': 
're:^http://', + 'uploader': "JMTV", + 'average_rating': float, + 'comment_count': int, + 'age_limit': 18, + } + } + + @classmethod + def build_video_url(cls, id): + server_nr = random.choice(cls.SERVER_NUMBERS) + return cls.VIDEO_URL_TEMPLATE % (server_nr, id) + + @staticmethod + def parse_upload_date(str): + return datetime.datetime.strptime(str, "%d-%m-%Y").strftime("%Y%m%d") + + @staticmethod + def parse_categories(str): + return map(lambda s: s.strip(), str.split(',')) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url_prefix = mobj.group(1) + id = mobj.group(2) + display_id = mobj.group(3) + + webpage = self._download_webpage(url, id) + + title = self._html_search_regex(r'

(.+?)

', webpage, 'title', + flags=re.DOTALL) + url = self.build_video_url(id) + upload_date = self.parse_upload_date( + self._search_regex(r'Publié le (\d\d-\d\d-\d{4})', webpage, + 'upload date')) + view_count = int(self._search_regex(r'(\d+) vues', webpage, 'view count')) + duration = int(self._search_regex('Durée (\d+)', webpage, 'duration')) + categories = self.parse_categories(self._html_search_regex( + r'
  • (.+?)
  • ', webpage, "categories", + flags=re.DOTALL)) + description = self._html_search_regex( + r'
    (.+?)
    ', webpage, "description", + flags=re.DOTALL) + thumbnail = url_prefix + self._html_search_regex(re.compile( + '
    .*?(.+?)', webpage, + "uploader", flags=re.DOTALL)) + average_rating = float(self._search_regex(r'Note : (\d+,\d+)', + webpage, "average rating").replace(',', '.')) + comment_count = int(self._search_regex(r'\((\d+)\)', webpage, + "comment count")) + + return { + 'id': id, + 'display_id': display_id, + 'url': url, + 'title': title, + 'upload_date': upload_date, + 'view_count': view_count, + 'duration': duration, + 'categories': categories, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'average_rating': average_rating, + 'comment_count': comment_count, + 'age_limit': 18, + } From 79c21abba7c9902f00ddac83a2af29c36fe0e122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Apr 2015 18:45:46 +0600 Subject: [PATCH 2/5] [utils] Add one more template to unified_strdate --- youtube_dl/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e1761265c9..be3f62da70 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -759,6 +759,7 @@ def unified_strdate(date_str, day_first=True): ] if day_first: format_expressions.extend([ + '%d-%m-%Y', '%d.%m.%Y', '%d/%m/%Y', '%d/%m/%y', @@ -766,6 +767,7 @@ def unified_strdate(date_str, day_first=True): ]) else: format_expressions.extend([ + '%m-%d-%Y', '%m.%d.%Y', '%m/%d/%Y', '%m/%d/%y', From 15ac8413c78b991f2e99b6bdc538bc8c5ae8e8a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Apr 2015 19:08:48 +0600 Subject: [PATCH 3/5] [utils] Avoid treating `*-%Y` date template as UTC offset --- youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index be3f62da70..52f0dd09aa 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -730,7 +730,8 @@ def unified_strdate(date_str, day_first=True): # Replace commas date_str = date_str.replace(',', ' ') # %z (UTC offset) is only supported in python>=3.2 - date_str = 
re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) + if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str): + date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) From 8cf70de428c3fef910ba966fb56d39478226acc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Apr 2015 19:11:01 +0600 Subject: [PATCH 4/5] [test_utils] Add test for unified_strdate --- test/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_utils.py b/test/test_utils.py index 4e524aca3b..2e3a6480cb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -227,6 +227,7 @@ class TestUtil(unittest.TestCase): self.assertEqual( unified_strdate('2/2/2015 6:47:40 PM', day_first=False), '20150202') + self.assertEqual(unified_strdate('25-09-2014'), '20140925') def test_find_xpath_attr(self): testxml = ''' From 7c39a65543b809b681434246b84710349f5837aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Apr 2015 19:13:37 +0600 Subject: [PATCH 5/5] [pornovoisines] Simplify --- youtube_dl/extractor/pornovoisines.py | 111 ++++++++++++-------------- 1 file changed, 53 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index efbb6a8187..9688ed9489 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -2,19 +2,23 @@ from __future__ import unicode_literals import re -import datetime import random -from ..compat import compat_urllib_parse from .common import InfoExtractor +from ..utils import ( + int_or_none, + float_or_none, + unified_strdate, +) + class PornoVoisinesIE(InfoExtractor): - _VALID_URL = r'^((?:http://)?(?:www\.)?pornovoisines.com)/showvideo/(\d+)/([^/]+)' + _VALID_URL = r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' - VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ +
_VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' - SERVER_NUMBERS = (1, 2) + _SERVER_NUMBERS = (1, 2) _TEST = { 'url': 'http://www.pornovoisines.com/showvideo/1285/recherche-appartement/', @@ -23,79 +27,70 @@ class PornoVoisinesIE(InfoExtractor): 'id': '1285', 'display_id': 'recherche-appartement', 'ext': 'mp4', - 'title': "Recherche appartement", + 'title': 'Recherche appartement', + 'description': 'md5:819ea0b785e2a04667a1a01cdc89594e', + 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20140925', - 'view_count': int, 'duration': 120, - 'categories': ["Débutante", "Scénario", "Sodomie"], - 'description': 're:^Pour la .+ original...$', - 'thumbnail': 're:^http://', - 'uploader': "JMTV", + 'view_count': int, 'average_rating': float, - 'comment_count': int, + 'categories': ['Débutante', 'Scénario', 'Sodomie'], 'age_limit': 18, } } @classmethod - def build_video_url(cls, id): - server_nr = random.choice(cls.SERVER_NUMBERS) - return cls.VIDEO_URL_TEMPLATE % (server_nr, id) - - @staticmethod - def parse_upload_date(str): - return datetime.datetime.strptime(str, "%d-%m-%Y").strftime("%Y%m%d") - - @staticmethod - def parse_categories(str): - return map(lambda s: s.strip(), str.split(',')) + def build_video_url(cls, num): + return cls._VIDEO_URL_TEMPLATE % (random.choice(cls._SERVER_NUMBERS), num) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - url_prefix = mobj.group(1) - id = mobj.group(2) - display_id = mobj.group(3) + video_id = mobj.group('id') + display_id = mobj.group('display_id') - webpage = self._download_webpage(url, id) + webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'

    (.+?)

    ', webpage, 'title', - flags=re.DOTALL) - url = self.build_video_url(id) - upload_date = self.parse_upload_date( - self._search_regex(r'Publié le (\d\d-\d\d-\d{4})', webpage, - 'upload date')) - view_count = int(self._search_regex(r'(\d+) vues', webpage, 'view count')) - duration = int(self._search_regex('Durée (\d+)', webpage, 'duration')) - categories = self.parse_categories(self._html_search_regex( - r'
  • (.+?)
  • ', webpage, "categories", - flags=re.DOTALL)) + video_url = self.build_video_url(video_id) + + title = self._html_search_regex( + r'

    (.+?)

    ', webpage, 'title', flags=re.DOTALL) description = self._html_search_regex( - r'
    (.+?)
    ', webpage, "description", - flags=re.DOTALL) - thumbnail = url_prefix + self._html_search_regex(re.compile( - '
    .*?(.+?)', webpage, - "uploader", flags=re.DOTALL)) - average_rating = float(self._search_regex(r'Note : (\d+,\d+)', - webpage, "average rating").replace(',', '.')) - comment_count = int(self._search_regex(r'\((\d+)\)', webpage, - "comment count")) + r'
    (.+?)
    ', + webpage, "description", fatal=False, flags=re.DOTALL) + + thumbnail = self._search_regex( + r'
    \s*