[ie/nhk] Improve metadata extraction (#8388)

Authored by: garret1317
This commit is contained in:
garret 2023-11-11 19:59:01 +00:00 committed by GitHub
parent 05adfd883a
commit 54579be436
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 82 additions and 20 deletions

View File

@ -3,6 +3,8 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
clean_html,
get_element_by_class,
int_or_none, int_or_none,
join_nonempty, join_nonempty,
parse_duration, parse_duration,
@ -45,25 +47,36 @@ class NhkBaseIE(InfoExtractor):
self.cache.store('nhk', 'api_info', api_info) self.cache.store('nhk', 'api_info', api_info)
return api_info return api_info
def _extract_formats_and_subtitles(self, vod_id): def _extract_stream_info(self, vod_id):
for refresh in (False, True): for refresh in (False, True):
api_info = self._get_api_info(refresh) api_info = self._get_api_info(refresh)
if not api_info: if not api_info:
continue continue
api_url = api_info.pop('url') api_url = api_info.pop('url')
stream_url = traverse_obj( meta = traverse_obj(
self._download_json( self._download_json(
api_url, vod_id, 'Downloading stream url info', fatal=False, query={ api_url, vod_id, 'Downloading stream url info', fatal=False, query={
**api_info, **api_info,
'type': 'json', 'type': 'json',
'optional_id': vod_id, 'optional_id': vod_id,
'active_flg': 1, 'active_flg': 1,
}), }), ('meta', 0))
('meta', 0, 'movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False) stream_url = traverse_obj(
if stream_url: meta, ('movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False)
return self._extract_m3u8_formats_and_subtitles(stream_url, vod_id)
if stream_url:
formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_url, vod_id)
return {
**traverse_obj(meta, {
'duration': ('duration', {int_or_none}),
'timestamp': ('publication_date', {unified_timestamp}),
'release_timestamp': ('insert_date', {unified_timestamp}),
'modified_timestamp': ('update_date', {unified_timestamp}),
}),
'formats': formats,
'subtitles': subtitles,
}
raise ExtractorError('Unable to extract stream url') raise ExtractorError('Unable to extract stream url')
def _extract_episode_info(self, url, episode=None): def _extract_episode_info(self, url, episode=None):
@ -77,11 +90,11 @@ class NhkBaseIE(InfoExtractor):
if fetch_episode: if fetch_episode:
episode = self._call_api( episode = self._call_api(
episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] episode_id, lang, is_video, True, episode_id[:4] == '9999')[0]
title = episode.get('sub_title_clean') or episode['sub_title']
def get_clean_field(key): def get_clean_field(key):
return episode.get(key + '_clean') or episode.get(key) return clean_html(episode.get(key + '_clean') or episode.get(key))
title = get_clean_field('sub_title')
series = get_clean_field('title') series = get_clean_field('title')
thumbnails = [] thumbnails = []
@ -96,22 +109,30 @@ class NhkBaseIE(InfoExtractor):
'url': 'https://www3.nhk.or.jp' + img_path, 'url': 'https://www3.nhk.or.jp' + img_path,
}) })
episode_name = title
if series and title:
title = f'{series} - {title}'
elif series and not title:
title = series
series = None
episode_name = None
else: # title, no series
episode_name = None
info = { info = {
'id': episode_id + '-' + lang, 'id': episode_id + '-' + lang,
'title': '%s - %s' % (series, title) if series and title else title, 'title': title,
'description': get_clean_field('description'), 'description': get_clean_field('description'),
'thumbnails': thumbnails, 'thumbnails': thumbnails,
'series': series, 'series': series,
'episode': title, 'episode': episode_name,
} }
if is_video: if is_video:
vod_id = episode['vod_id'] vod_id = episode['vod_id']
formats, subs = self._extract_formats_and_subtitles(vod_id)
info.update({ info.update({
**self._extract_stream_info(vod_id),
'id': vod_id, 'id': vod_id,
'formats': formats,
'subtitles': subs,
}) })
else: else:
@ -148,6 +169,14 @@ class NhkVodIE(NhkBaseIE):
'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463', 'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463',
'episode': 'The Tohoku Shinkansen: Full Speed Ahead', 'episode': 'The Tohoku Shinkansen: Full Speed Ahead',
'series': 'Japan Railway Journal', 'series': 'Japan Railway Journal',
'modified_timestamp': 1694243656,
'timestamp': 1681428600,
'release_timestamp': 1693883728,
'duration': 1679,
'upload_date': '20230413',
'modified_date': '20230909',
'release_date': '20230905',
}, },
}, { }, {
# video clip # video clip
@ -161,6 +190,13 @@ class NhkVodIE(NhkBaseIE):
'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed', 'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed',
'series': 'Dining with the Chef', 'series': 'Dining with the Chef',
'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU',
'duration': 148,
'upload_date': '20190816',
'release_date': '20230902',
'release_timestamp': 1693619292,
'modified_timestamp': 1694168033,
'modified_date': '20230908',
'timestamp': 1565997540,
}, },
}, { }, {
# radio # radio
@ -170,7 +206,7 @@ class NhkVodIE(NhkBaseIE):
'ext': 'm4a', 'ext': 'm4a',
'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines', 'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines',
'series': 'Living in Japan', 'series': 'Living in Japan',
'description': 'md5:850611969932874b4a3309e0cae06c2f', 'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab',
'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545', 'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545',
'episode': 'Tips for Travelers to Japan / Ramen Vending Machines' 'episode': 'Tips for Travelers to Japan / Ramen Vending Machines'
}, },
@ -212,6 +248,23 @@ class NhkVodIE(NhkBaseIE):
'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0', 'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0',
}, },
'skip': 'expires 2023-10-15', 'skip': 'expires 2023-10-15',
}, {
# a one-off (single-episode series). title from the api is just '<p></p>'
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/3004952/',
'info_dict': {
'id': 'nw_vod_v_en_3004_952_20230723091000_01_1690074552',
'ext': 'mp4',
'title': 'Barakan Discovers AMAMI OSHIMA: Isson\'s Treasure Island',
'description': 'md5:5db620c46a0698451cc59add8816b797',
'thumbnail': 'md5:67d9ff28009ba379bfa85ad1aaa0e2bd',
'release_date': '20230905',
'timestamp': 1690103400,
'duration': 2939,
'release_timestamp': 1693898699,
'modified_timestamp': 1698057495,
'modified_date': '20231023',
'upload_date': '20230723',
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
@ -226,13 +279,15 @@ class NhkVodProgramIE(NhkBaseIE):
'info_dict': { 'info_dict': {
'id': 'sumo', 'id': 'sumo',
'title': 'GRAND SUMO Highlights', 'title': 'GRAND SUMO Highlights',
'description': 'md5:fc20d02dc6ce85e4b72e0273aa52fdbf',
}, },
'playlist_mincount': 12, 'playlist_mincount': 0,
}, { }, {
'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
'info_dict': { 'info_dict': {
'id': 'japanrailway', 'id': 'japanrailway',
'title': 'Japan Railway Journal', 'title': 'Japan Railway Journal',
'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f',
}, },
'playlist_mincount': 12, 'playlist_mincount': 12,
}, { }, {
@ -241,6 +296,7 @@ class NhkVodProgramIE(NhkBaseIE):
'info_dict': { 'info_dict': {
'id': 'japanrailway', 'id': 'japanrailway',
'title': 'Japan Railway Journal', 'title': 'Japan Railway Journal',
'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f',
}, },
'playlist_mincount': 5, 'playlist_mincount': 5,
}, { }, {
@ -265,11 +321,11 @@ class NhkVodProgramIE(NhkBaseIE):
entries.append(self._extract_episode_info( entries.append(self._extract_episode_info(
urljoin(url, episode_path), episode)) urljoin(url, episode_path), episode))
program_title = None html = self._download_webpage(url, program_id)
if entries: program_title = clean_html(get_element_by_class('p-programDetail__title', html))
program_title = entries[0].get('series') program_description = clean_html(get_element_by_class('p-programDetail__text', html))
return self.playlist_result(entries, program_id, program_title) return self.playlist_result(entries, program_id, program_title, program_description)
class NhkForSchoolBangumiIE(InfoExtractor): class NhkForSchoolBangumiIE(InfoExtractor):
@ -421,6 +477,7 @@ class NhkRadiruIE(InfoExtractor):
'skip': 'Episode expired on 2023-04-16', 'skip': 'Episode expired on 2023-04-16',
'info_dict': { 'info_dict': {
'channel': 'NHK-FM', 'channel': 'NHK-FM',
'uploader': 'NHK-FM',
'description': 'md5:94b08bdeadde81a97df4ec882acce3e9', 'description': 'md5:94b08bdeadde81a97df4ec882acce3e9',
'ext': 'm4a', 'ext': 'm4a',
'id': '0449_01_3853544', 'id': '0449_01_3853544',
@ -441,6 +498,7 @@ class NhkRadiruIE(InfoExtractor):
'title': 'ベストオブクラシック', 'title': 'ベストオブクラシック',
'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。', 'description': '世界中の上質な演奏会をじっくり堪能する本格派クラシック番組。',
'channel': 'NHK-FM', 'channel': 'NHK-FM',
'uploader': 'NHK-FM',
'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg', 'thumbnail': 'https://www.nhk.or.jp/prog/img/458/g458.jpg',
}, },
'playlist_mincount': 3, 'playlist_mincount': 3,
@ -454,6 +512,7 @@ class NhkRadiruIE(InfoExtractor):
'title': '有島武郎「一房のぶどう」', 'title': '有島武郎「一房のぶどう」',
'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より', 'description': '朗読:川野一宇(ラジオ深夜便アンカー)\r\n\r\n2016年12月8日放送「ラジオ深夜便『アンカー朗読シリーズ』」より',
'channel': 'NHKラジオ第1、NHK-FM', 'channel': 'NHKラジオ第1、NHK-FM',
'uploader': 'NHKラジオ第1、NHK-FM',
'timestamp': 1635757200, 'timestamp': 1635757200,
'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg', 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F300/img/corner/box_109_thumbnail.jpg',
'release_date': '20161207', 'release_date': '20161207',
@ -469,6 +528,7 @@ class NhkRadiruIE(InfoExtractor):
'id': 'F261_01_3855109', 'id': 'F261_01_3855109',
'ext': 'm4a', 'ext': 'm4a',
'channel': 'NHKラジオ第1', 'channel': 'NHKラジオ第1',
'uploader': 'NHKラジオ第1',
'timestamp': 1681635900, 'timestamp': 1681635900,
'release_date': '20230416', 'release_date': '20230416',
'series': 'NHKラジオニュース', 'series': 'NHKラジオニュース',
@ -513,6 +573,7 @@ class NhkRadiruIE(InfoExtractor):
series_meta = traverse_obj(meta, { series_meta = traverse_obj(meta, {
'title': 'program_name', 'title': 'program_name',
'channel': 'media_name', 'channel': 'media_name',
'uploader': 'media_name',
'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}), 'thumbnail': (('thumbnail_c', 'thumbnail_p'), {url_or_none}),
}, get_all=False) }, get_all=False)
@ -541,6 +602,7 @@ class NhkRadioNewsPageIE(InfoExtractor):
'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg', 'thumbnail': 'https://www.nhk.or.jp/radioondemand/json/F261/img/RADIONEWS_640.jpg',
'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d', 'description': 'md5:bf2c5b397e44bc7eb26de98d8f15d79d',
'channel': 'NHKラジオ第1', 'channel': 'NHKラジオ第1',
'uploader': 'NHKラジオ第1',
'title': 'NHKラジオニュース', 'title': 'NHKラジオニュース',
} }
}] }]