[jamendo] improve extraction

- fix album extraction (closes #18564)
- improve metadata extraction (closes #18565) (closes #21379)
This commit is contained in:
Remita Amine 2019-11-04 15:43:52 +01:00
parent bf45295c53
commit e452345fc5
1 changed file with 99 additions and 63 deletions

View File

@ -1,38 +1,26 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re import hashlib
import random
from ..compat import compat_urlparse from ..compat import compat_str
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import parse_duration from ..utils import (
clean_html,
int_or_none,
try_get,
)
class JamendoBaseIE(InfoExtractor): class JamendoIE(InfoExtractor):
def _extract_meta(self, webpage, fatal=True):
    """Return a ``(title, artist, second)`` tuple derived from *webpage*.

    The raw page title is taken from ``self._og_search_title`` or, when
    that yields nothing, from the text following a ``<title>`` tag.  The
    part preceding ``| Jamendo Music`` is kept as the title.  If no title
    was obtained that way, the ``name`` meta entry is looked up through
    ``self._html_search_meta``; *fatal* is forwarded to that lookup only.

    A title shaped like ``"<left> - <right>"`` is split into ``artist``
    (left) and ``second`` (right); otherwise both are ``None``.
    """
    page_title = self._og_search_title(webpage, default=None)
    if not page_title:
        page_title = self._search_regex(
            r'<title>([^<]+)', webpage, 'title', default=None)

    title = None
    if page_title:
        # Keep only what precedes the "| Jamendo Music" suffix.
        title = self._search_regex(
            r'(.+?)\s*\|\s*Jamendo Music', page_title, 'title',
            default=None)
    if not title:
        title = self._html_search_meta(
            'name', webpage, 'title', fatal=fatal)

    # The first group is greedy, so the split happens at the last " - ".
    match = re.search(r'(.+) - (.+)', title or '')
    if match:
        artist, second = match.groups()
    else:
        artist = second = None
    return title, artist, second
class JamendoIE(JamendoBaseIE):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?: (?:
licensing\.jamendo\.com/[^/]+| licensing\.jamendo\.com/[^/]+|
(?:www\.)?jamendo\.com (?:www\.)?jamendo\.com
) )
/track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+) /track/(?P<id>[0-9]+)(?:/(?P<display_id>[^/?#&]+))?
''' '''
_TESTS = [{ _TESTS = [{
'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i', 'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i',
@ -45,7 +33,9 @@ class JamendoIE(JamendoBaseIE):
'artist': 'Maya Filipič', 'artist': 'Maya Filipič',
'track': 'Stories from Emona I', 'track': 'Stories from Emona I',
'duration': 210, 'duration': 210,
'thumbnail': r're:^https?://.*\.jpg' 'thumbnail': r're:^https?://.*\.jpg',
'timestamp': 1217438117,
'upload_date': '20080730',
} }
}, { }, {
'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock', 'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock',
@ -53,15 +43,19 @@ class JamendoIE(JamendoBaseIE):
}] }]
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._VALID_URL_RE.match(url) track_id, display_id = self._VALID_URL_RE.match(url).groups()
track_id = mobj.group('id') webpage = self._download_webpage(url, track_id)
display_id = mobj.group('display_id') models = self._parse_json(self._html_search_regex(
r"data-bundled-models='([^']+)",
webpage = self._download_webpage( webpage, 'bundled models'), track_id)
'https://www.jamendo.com/track/%s/%s' % (track_id, display_id), track = models['track']['models'][0]
display_id) title = track_name = track['name']
get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
title, artist, track = self._extract_meta(webpage) artist = get_model('artist')
artist_name = artist.get('name')
if artist_name:
title = '%s - %s' % (artist_name, title)
album = get_model('album')
formats = [{ formats = [{
'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
@ -77,31 +71,58 @@ class JamendoIE(JamendoBaseIE):
))] ))]
self._sort_formats(formats) self._sort_formats(formats)
thumbnail = self._html_search_meta( urls = []
'image', webpage, 'thumbnail', fatal=False) thumbnails = []
duration = parse_duration(self._search_regex( for _, covers in track.get('cover', {}).items():
r'<span[^>]+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']', for cover_id, cover_url in covers.items():
webpage, 'duration', fatal=False)) if not cover_url or cover_url in urls:
continue
urls.append(cover_url)
size = int_or_none(cover_id.lstrip('size'))
thumbnails.append({
'id': cover_id,
'url': cover_url,
'width': size,
'height': size,
})
tags = []
for tag in track.get('tags', []):
tag_name = tag.get('name')
if not tag_name:
continue
tags.append(tag_name)
stats = track.get('stats') or {}
return { return {
'id': track_id, 'id': track_id,
'display_id': display_id, 'display_id': display_id,
'thumbnail': thumbnail, 'thumbnails': thumbnails,
'title': title, 'title': title,
'duration': duration, 'description': track.get('description'),
'artist': artist, 'duration': int_or_none(track.get('duration')),
'track': track, 'artist': artist_name,
'formats': formats 'track': track_name,
'album': album.get('name'),
'formats': formats,
'license': '-'.join(track.get('licenseCC', [])) or None,
'timestamp': int_or_none(track.get('dateCreated')),
'view_count': int_or_none(stats.get('listenedAll')),
'like_count': int_or_none(stats.get('favorited')),
'average_rating': int_or_none(stats.get('averageNote')),
'tags': tags,
} }
class JamendoAlbumIE(JamendoBaseIE): class JamendoAlbumIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)' _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)'
_TEST = { _TEST = {
'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
'info_dict': { 'info_dict': {
'id': '121486', 'id': '121486',
'title': 'Shearer - Duck On Cover' 'title': 'Duck On Cover',
'description': 'md5:c2920eaeef07d7af5b96d7c64daf1239',
}, },
'playlist': [{ 'playlist': [{
'md5': 'e1a2fcb42bda30dfac990212924149a8', 'md5': 'e1a2fcb42bda30dfac990212924149a8',
@ -111,6 +132,8 @@ class JamendoAlbumIE(JamendoBaseIE):
'title': 'Shearer - Warmachine', 'title': 'Shearer - Warmachine',
'artist': 'Shearer', 'artist': 'Shearer',
'track': 'Warmachine', 'track': 'Warmachine',
'timestamp': 1368089771,
'upload_date': '20130509',
} }
}, { }, {
'md5': '1f358d7b2f98edfe90fd55dac0799d50', 'md5': '1f358d7b2f98edfe90fd55dac0799d50',
@ -120,6 +143,8 @@ class JamendoAlbumIE(JamendoBaseIE):
'title': 'Shearer - Without Your Ghost', 'title': 'Shearer - Without Your Ghost',
'artist': 'Shearer', 'artist': 'Shearer',
'track': 'Without Your Ghost', 'track': 'Without Your Ghost',
'timestamp': 1368089771,
'upload_date': '20130509',
} }
}], }],
'params': { 'params': {
@ -127,24 +152,35 @@ class JamendoAlbumIE(JamendoBaseIE):
} }
} }
def _call_api(self, resource, resource_id):
    """Fetch one *resource* record from the Jamendo web API.

    Requests ``https://www.jamendo.com/api/<resource>s`` with
    ``id[]=<resource_id>`` as query and an ``X-Jam-Call`` header of the
    form ``$<sha1 hex of path + rand>*<rand>~``, where ``rand`` is the
    string form of a fresh ``random.random()`` value.

    Returns element ``0`` of whatever ``self._download_json`` yields
    (presumably a decoded JSON list -- confirm against the helper).
    """
    api_path = '/api/%ss' % resource
    nonce = compat_str(random.random())
    # The header digest covers the API path followed by the random value.
    digest = hashlib.sha1((api_path + nonce).encode()).hexdigest()
    response = self._download_json(
        'https://www.jamendo.com' + api_path, resource_id,
        query={'id[]': resource_id},
        headers={'X-Jam-Call': '$%s*%s~' % (digest, nonce)})
    return response[0]
def _real_extract(self, url): def _real_extract(self, url):
mobj = self._VALID_URL_RE.match(url) album_id = self._match_id(url)
album_id = mobj.group('id') album = self._call_api('album', album_id)
album_name = album.get('name')
webpage = self._download_webpage(url, mobj.group('display_id')) entries = []
for track in album.get('tracks', []):
title, artist, album = self._extract_meta(webpage, fatal=False) track_id = track.get('id')
if not track_id:
entries = [{ continue
track_id = compat_str(track_id)
entries.append({
'_type': 'url_transparent', '_type': 'url_transparent',
'url': compat_urlparse.urljoin(url, m.group('path')), 'url': 'https://www.jamendo.com/track/' + track_id,
'ie_key': JamendoIE.ie_key(), 'ie_key': JamendoIE.ie_key(),
'id': self._search_regex( 'id': track_id,
r'/track/(\d+)', m.group('path'), 'track id', default=None), 'album': album_name,
'artist': artist, })
'album': album,
} for m in re.finditer(
r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link',
webpage)]
return self.playlist_result(entries, album_id, title) return self.playlist_result(
entries, album_id, album_name,
clean_html(try_get(album, lambda x: x['description']['en'], compat_str)))