from __future__ import unicode_literals import base64 import functools import itertools import re from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_chr, compat_ord, compat_str, compat_urllib_parse_unquote, compat_urlparse, compat_zip ) from ..utils import ( clean_html, ExtractorError, int_or_none, OnDemandPagedList, str_to_int, try_get, urljoin, ) class MixcloudIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' IE_NAME = 'mixcloud' _TESTS = [{ 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', 'info_dict': { 'id': 'dholbach-cryptkeeper', 'ext': 'm4a', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', 'uploader': 'Daniel Holbach', 'uploader_id': 'dholbach', 'thumbnail': r're:https?://.*\.jpg', 'view_count': int, }, }, { 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', 'info_dict': { 'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat', 'ext': 'mp3', 'title': 'Caribou 7 inch Vinyl Mix & Chat', 'description': 'md5:2b8aec6adce69f9d41724647c65875e8', 'uploader': 'Gilles Peterson Worldwide', 'uploader_id': 'gillespeterson', 'thumbnail': 're:https?://.*', 'view_count': int, }, }, { 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/', 'only_matching': True, }] @staticmethod def _decrypt_xor_cipher(key, ciphertext): """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR.""" return ''.join([ compat_chr(compat_ord(ch) ^ compat_ord(k)) for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group(1) cloudcast_name = mobj.group(2) track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name))) webpage = self._download_webpage(url, track_id) # Legacy path encrypted_play_info = self._search_regex( r'm-play-info="([^"]+)"', webpage, 'play info', default=None) if encrypted_play_info is not None: # Decode encrypted_play_info = compat_b64decode(encrypted_play_info) else: # New path full_info_json = self._parse_json(self._html_search_regex( r'', webpage, 'play info'), 'play info') for item in full_info_json: item_data = try_get( item, lambda x: x['cloudcast']['data']['cloudcastLookup'], dict) if try_get(item_data, lambda x: x['streamInfo']['url']): info_json = item_data break else: raise ExtractorError('Failed to extract matching stream info') message = self._html_search_regex( r'(?s)