[extractor/vrt] Overhaul extractors (#6244)

* Fixes `VrtNU` extractor to work with the VRT MAX site change
* Adapts `VRT`, `Ketnet` and `DagelijkseKost` extractors to the new VRT API
* Removes `Canvas` and `CanvasEen` extractors; the sites and API no longer exist
* Moves all remaining VRT-related extractors into the `vrt` module

Closes #4908
Authored by: jeroenj, bergoid, bashonly

Co-authored-by: bergoid <bergoid@users.noreply.github.com>
Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
This commit is contained in:
Jeroen Jacobs 2023-06-02 20:29:00 +02:00 committed by GitHub
parent 55ed4ff734
commit 1a7dcca378
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 384 additions and 496 deletions

View File

@ -295,12 +295,6 @@ from .camwithher import CamWithHerIE
from .canalalpha import CanalAlphaIE from .canalalpha import CanalAlphaIE
from .canalplus import CanalplusIE from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE from .canalc2 import Canalc2IE
from .canvas import (
CanvasIE,
CanvasEenIE,
VrtNUIE,
DagelijkseKostIE,
)
from .carambatv import ( from .carambatv import (
CarambaTVIE, CarambaTVIE,
CarambaTVPageIE, CarambaTVPageIE,
@ -894,7 +888,6 @@ from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE from .keezmovies import KeezMoviesIE
from .kelbyone import KelbyOneIE from .kelbyone import KelbyOneIE
from .ketnet import KetnetIE
from .khanacademy import ( from .khanacademy import (
KhanAcademyIE, KhanAcademyIE,
KhanAcademyUnitIE, KhanAcademyUnitIE,
@ -2285,7 +2278,12 @@ from .voxmedia import (
VoxMediaVolumeIE, VoxMediaVolumeIE,
VoxMediaIE, VoxMediaIE,
) )
from .vrt import VRTIE from .vrt import (
VRTIE,
VrtNUIE,
KetnetIE,
DagelijkseKostIE,
)
from .vrak import VrakIE from .vrak import VrakIE
from .vrv import ( from .vrv import (
VRVIE, VRVIE,

View File

@ -1,383 +0,0 @@
import json
from .common import InfoExtractor
from .gigya import GigyaBaseIE
from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
clean_html,
extract_attributes,
float_or_none,
get_element_by_class,
int_or_none,
merge_dicts,
str_or_none,
strip_or_none,
url_or_none,
urlencode_postdata
)
class CanvasIE(InfoExtractor):
    """Extract media from ``mediazone.vrt.be`` asset API URLs.

    The legacy mediazone endpoint is tried first (except for the
    ``vrtvideo`` site); when it yields nothing, the token-protected
    aggregator REST API is queried instead.
    """
    _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
        'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
        'info_dict': {
            'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
            'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
            'ext': 'mp4',
            'title': 'Nachtwacht: De Greystook',
            'description': 'Nachtwacht: De Greystook',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1468.02,
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
        'only_matching': True,
    }]
    _GEO_BYPASS = False
    # Maps the API's HLS target types to the m3u8 entry protocol to use
    _HLS_ENTRY_PROTOCOLS_MAP = {
        'HLS': 'm3u8_native',
        'HLS_AES': 'm3u8_native',
    }
    _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2'

    def _real_extract(self, url):
        """Return the info dict (formats, subtitles, metadata) for an asset URL.

        Raises ExtractorError (or the login/geo-restriction errors) when the
        aggregator API answers without a ``title`` field.
        """
        match = self._match_valid_url(url)
        site_id = match.group('site_id')
        video_id = match.group('id')

        data = None
        if site_id != 'vrtvideo':
            # Old API endpoint, serves more formats but may fail for some videos
            data = self._download_json(
                f'https://mediazone.vrt.be/api/v1/{site_id}/assets/{video_id}',
                video_id, 'Downloading asset JSON',
                'Unable to download asset JSON', fatal=False)

        # New API endpoint
        if not data:
            identity_token = self._download_json(
                'https://token.vrt.be/refreshtoken', video_id,
                note='refreshtoken: Retrieve vrtnutoken',
                errnote='refreshtoken failed')['vrtnutoken']

            token_headers = {
                **self.geo_verification_headers(),
                'Content-Type': 'application/json; charset=utf-8',
            }
            token_payload = json.dumps({'identityToken': identity_token}).encode('utf-8')
            player_token = self._download_json(
                f'{self._REST_API_BASE}/tokens', video_id, 'Downloading token',
                headers=token_headers, data=token_payload)['vrtPlayerToken']

            # HTTP 400 is accepted so that the error payload can be inspected below
            data = self._download_json(
                f'{self._REST_API_BASE}/videos/{video_id}',
                video_id, 'Downloading video JSON', query={
                    'vrtPlayerToken': player_token,
                    'client': 'null',
                }, expected_status=400)

            if 'title' not in data:
                error_code = data.get('code')
                if error_code == 'AUTHENTICATION_REQUIRED':
                    self.raise_login_required()
                elif error_code == 'INVALID_LOCATION':
                    self.raise_geo_restricted(countries=['BE'])
                raise ExtractorError(data.get('message') or error_code, expected=True)

        # Note: The title may be an empty string
        title = data['title'] or f'{site_id} {video_id}'
        description = data.get('description')

        formats = []
        subtitles = {}
        for target in data['targetUrls']:
            stream_url = url_or_none(target.get('url'))
            stream_type = str_or_none(target.get('type'))
            if not (stream_url and stream_type):
                continue
            stream_type = stream_type.upper()

            # Each branch yields the formats to add and, where the manifest
            # type can carry them, the subtitles to merge (None otherwise)
            found_subs = None
            if stream_type in self._HLS_ENTRY_PROTOCOLS_MAP:
                found_formats, found_subs = self._extract_m3u8_formats_and_subtitles(
                    stream_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[stream_type],
                    m3u8_id=stream_type, fatal=False)
            elif stream_type == 'HDS':
                found_formats = self._extract_f4m_formats(
                    stream_url, video_id, f4m_id=stream_type, fatal=False)
            elif stream_type == 'MPEG_DASH':
                found_formats, found_subs = self._extract_mpd_formats_and_subtitles(
                    stream_url, video_id, mpd_id=stream_type, fatal=False)
            elif stream_type == 'HSS':
                found_formats, found_subs = self._extract_ism_formats_and_subtitles(
                    stream_url, video_id, ism_id='mss', fatal=False)
            else:
                found_formats = [{
                    'format_id': stream_type,
                    'url': stream_url,
                }]

            formats.extend(found_formats)
            if found_subs is not None:
                subtitles = self._merge_subtitles(subtitles, found_subs)

        # Standalone closed captions are filed under Dutch ('nl')
        caption_entries = data.get('subtitleUrls')
        if isinstance(caption_entries, list):
            for entry in caption_entries:
                caption_url = entry.get('url')
                if caption_url and entry.get('type') == 'CLOSED':
                    subtitles.setdefault('nl', []).append({'url': caption_url})

        return {
            'id': video_id,
            'display_id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'duration': float_or_none(data.get('duration'), 1000),
            'thumbnail': data.get('posterImageUrl'),
            'subtitles': subtitles,
        }
class CanvasEenIE(InfoExtractor):
    """Extract videos embedded in canvas.be and een.be pages.

    The page is only scraped for the asset ID and basic metadata; the media
    itself is resolved by CanvasIE through a ``url_transparent`` result.
    """
    IE_DESC = 'canvas.be and een.be'
    _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
        'md5': 'ed66976748d12350b118455979cca293',
        'info_dict': {
            'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
            'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
            'ext': 'flv',
            'title': 'De afspraak veilt voor de Warmste Week',
            'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 49.02,
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        # with subtitles
        'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
        'info_dict': {
            'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
            'display_id': 'pieter-0167',
            'ext': 'mp4',
            'title': 'Pieter 0167',
            'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 2553.08,
            'subtitles': {
                'nl': [{
                    'ext': 'vtt',
                }],
            },
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Pagina niet gevonden',
    }, {
        'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan',
        'info_dict': {
            'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8',
            'display_id': 'emma-pakt-thilly-aan',
            'ext': 'mp4',
            'title': 'Emma pakt Thilly aan',
            'description': 'md5:c5c9b572388a99b2690030afa3f3bad7',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 118.24,
        },
        'params': {
            'skip_download': True,
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Scrape the page for the asset ID and delegate to CanvasIE."""
        site_id, display_id = self._match_valid_url(url).group('site_id', 'id')
        webpage = self._download_webpage(url, display_id)

        # Prefer the on-page heading; fall back to the OpenGraph title
        raw_title = self._search_regex(
            r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
            webpage, 'title', default=None)
        if not raw_title:
            raw_title = self._og_search_title(webpage, default=None)
        title = strip_or_none(raw_title)

        video_id = self._html_search_regex(
            r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
            group='id')
        asset_url = f'https://mediazone.vrt.be/api/v1/{site_id}/assets/{video_id}'

        return {
            '_type': 'url_transparent',
            'url': asset_url,
            'ie_key': CanvasIE.ie_key(),
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': self._og_search_description(webpage),
        }
class VrtNUIE(GigyaBaseIE):
    """Extract VRT NU episodes.

    Logs in through Gigya when credentials are supplied, scrapes the episode
    page for the media IDs and hands the actual media resolution to CanvasIE
    through a ``url_transparent`` result.
    """
    IE_DESC = 'VrtNU.be'
    _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
    _TESTS = [{
        # Available via old API endpoint
        'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/',
        'info_dict': {
            'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
            'ext': 'mp4',
            'title': 'Postbus X - Aflevering 1 (Seizoen 1989)',
            'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7',
            'duration': 1457.04,
            'thumbnail': r're:^https?://.*\.jpg$',
            'series': 'Postbus X',
            'season': 'Seizoen 1989',
            'season_number': 1989,
            'episode': 'De zwarte weduwe',
            'episode_number': 1,
            'timestamp': 1595822400,
            'upload_date': '20200727',
        },
        'skip': 'This video is only available for registered users',
        'expected_warnings': ['is not a supported codec'],
    }, {
        # Only available via new API endpoint
        'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
        'info_dict': {
            'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
            'ext': 'mp4',
            'title': 'Aflevering 5',
            'description': 'Wie valt door de mand tijdens een missie?',
            'duration': 2967.06,
            'season': 'Season 1',
            'season_number': 1,
            'episode_number': 5,
        },
        'skip': 'This video is only available for registered users',
        'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
    }]
    _NETRC_MACHINE = 'vrtnu'
    _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
    _CONTEXT_ID = 'R3595707040'

    def _perform_login(self, username, password):
        """Log in via Gigya, then exchange the Gigya signature for VRT session cookies.

        Raises ExtractorError when Gigya reports an error, when the XSRF
        cookie is not set by the init-login request, or when the VRT login
        endpoint keeps answering HTTP 401 on every attempt.
        """
        auth_info = self._gigya_login({
            'APIKey': self._APIKEY,
            'targetEnv': 'jssdk',
            'loginID': username,
            'password': password,
            'authMode': 'cookie',
        })

        if auth_info.get('errorDetails'):
            raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True)

        # Sometimes authentication fails for no good reason, retry
        max_attempts = 3
        for attempt in range(1, max_attempts + 1):
            try:
                self._request_webpage(
                    'https://token.vrt.be/vrtnuinitlogin',
                    None, note='Requesting XSRF Token', errnote='Could not get XSRF Token',
                    query={'provider': 'site', 'destination': 'https://www.vrt.be/vrtnu/'})

                # The cookie is expected to be set by the request above; fail
                # with a readable error instead of an AttributeError on None
                xsrf_cookie = self._get_cookies('https://login.vrt.be').get('OIDCXSRF')
                if xsrf_cookie is None:
                    raise ExtractorError('Could not get XSRF Token')

                post_data = {
                    'UID': auth_info['UID'],
                    'UIDSignature': auth_info['UIDSignature'],
                    'signatureTimestamp': auth_info['signatureTimestamp'],
                    '_csrf': xsrf_cookie.value,
                }

                self._request_webpage(
                    'https://login.vrt.be/perform_login',
                    None, note='Performing login', errnote='perform login failed',
                    headers={}, query={
                        'client_id': 'vrtnu-site'
                    }, data=urlencode_postdata(post_data))

            except ExtractorError as e:
                # Only HTTP 401 is treated as a transient failure worth retrying
                if not (isinstance(e.cause, compat_HTTPError) and e.cause.code == 401):
                    raise
                self.report_warning('Authentication failed')
                if attempt == max_attempts:
                    # Out of attempts: surface the failure instead of
                    # silently carrying on without a session
                    raise
                self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
            else:
                break

    def _real_extract(self, url):
        """Scrape the episode page for its media IDs and delegate to CanvasIE."""
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        attrs = extract_attributes(self._search_regex(
            r'(<nui-media[^>]+>)', webpage, 'media element'))
        video_id = attrs['videoid']
        publication_id = attrs.get('publicationid')
        if publication_id:
            # Full asset IDs take the form '<publication id>$<video id>'
            video_id = publication_id + '$' + video_id

        # Optional page metadata; an empty dict is used when it is absent or unparsable
        page = (self._parse_json(self._search_regex(
            r'digitalData\s*=\s*({.+?});', webpage, 'digial data',
            default='{}'), video_id, fatal=False) or {}).get('page') or {}

        info = self._search_json_ld(webpage, display_id, default={})
        return merge_dicts(info, {
            '_type': 'url_transparent',
            'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
            'ie_key': CanvasIE.ie_key(),
            'id': video_id,
            'display_id': display_id,
            'season_number': int_or_none(page.get('episode_season')),
        })
class DagelijkseKostIE(InfoExtractor):
    """Extract recipe videos from dagelijksekost.een.be.

    Title and description are scraped from the page; the media itself is
    resolved by CanvasIE through a ``url_transparent`` result.
    """
    IE_DESC = 'dagelijksekost.een.be'
    _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
        'md5': '30bfffc323009a3e5f689bef6efa2365',
        'info_dict': {
            'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
            'display_id': 'hachis-parmentier-met-witloof',
            'ext': 'mp4',
            'title': 'Hachis parmentier met witloof',
            'description': 'md5:9960478392d87f63567b5b117688cdc5',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 283.02,
        },
        'expected_warnings': ['is not a supported codec'],
    }

    def _real_extract(self, url):
        """Scrape the recipe page and delegate media extraction to CanvasIE."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # Page element first, meta tag as fallback
        raw_title = get_element_by_class('dish-metadata__title', webpage)
        if not raw_title:
            raw_title = self._html_search_meta('twitter:title', webpage)
        title = strip_or_none(raw_title)

        description = clean_html(get_element_by_class('dish-description', webpage))
        if not description:
            description = self._html_search_meta(
                ('description', 'twitter:description', 'og:description'),
                webpage)

        video_id = self._html_search_regex(
            r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
            group='id')

        return {
            '_type': 'url_transparent',
            'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
            'ie_key': CanvasIE.ie_key(),
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
        }

View File

@ -1,70 +0,0 @@
from .canvas import CanvasIE
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
int_or_none,
parse_iso8601,
)
class KetnetIE(InfoExtractor):
    """Extract videos from ketnet.be.

    Metadata comes from the Ketnet GraphQL endpoint; the media itself is
    resolved by CanvasIE through a ``url_transparent`` result.
    """
    _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook',
        'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
        'info_dict': {
            'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd',
            'ext': 'mp4',
            'title': 'Nachtwacht - Reeks 3: Aflevering 1',
            'description': 'De Nachtwacht krijgt te maken met een parasiet',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1468.02,
            'timestamp': 1609225200,
            'upload_date': '20201229',
            'series': 'Nachtwacht',
            'season': 'Reeks 3',
            'episode': 'De Greystook',
            'episode_number': 1,
        },
        'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
    }, {
        'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Query the GraphQL endpoint for metadata and delegate to CanvasIE."""
        display_id = self._match_id(url)

        # The URL path (display_id) identifies the content node to query
        graphql_query = '''{
video(id: "content/ketnet/nl/%s.model.json") {
description
episodeNr
imageUrl
mediaReference
programTitle
publicationDate
seasonTitle
subtitleVideodetail
titleVideodetail
}
}''' % display_id
        response = self._download_json(
            'https://senior-bff.ketnet.be/graphql', display_id,
            query={'query': graphql_query})
        video = response['data']['video']

        # mediaReference is percent-encoded in the response
        media_id = compat_urllib_parse_unquote(video['mediaReference'])

        return {
            '_type': 'url_transparent',
            'id': media_id,
            'title': video['titleVideodetail'],
            'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + media_id,
            'thumbnail': video.get('imageUrl'),
            'description': video.get('description'),
            'timestamp': parse_iso8601(video.get('publicationDate')),
            'series': video.get('programTitle'),
            'season': video.get('seasonTitle'),
            'episode': video.get('subtitleVideodetail'),
            'episode_number': int_or_none(video.get('episodeNr')),
            'ie_key': CanvasIE.ie_key(),
        }

View File

@ -1,45 +1,137 @@
from .common import InfoExtractor import functools
import json
import time
import urllib.error
import urllib.parse
from .gigya import GigyaBaseIE
from ..utils import ( from ..utils import (
ExtractorError,
clean_html,
extract_attributes, extract_attributes,
float_or_none, float_or_none,
get_element_by_class, get_element_by_class,
get_element_html_by_class,
int_or_none,
join_nonempty,
jwt_encode_hs256,
make_archive_id,
parse_age_limit,
parse_iso8601,
str_or_none,
strip_or_none, strip_or_none,
unified_timestamp, traverse_obj,
url_or_none,
urlencode_postdata,
) )
class VRTIE(InfoExtractor): class VRTBaseIE(GigyaBaseIE):
_GEO_BYPASS = False
_PLAYER_INFO = {
'platform': 'desktop',
'app': {
'type': 'browser',
'name': 'Chrome',
},
'device': 'undefined (undefined)',
'os': {
'name': 'Windows',
'version': 'x86_64'
},
'player': {
'name': 'VRT web player',
'version': '2.7.4-prod-2023-04-19T06:05:45'
}
}
# From https://player.vrt.be/vrtnws/js/main.js & https://player.vrt.be/ketnet/js/main.fd1de01a40a1e3d842ea.js
_JWT_KEY_ID = '0-0Fp51UZykfaiCJrfTE3+oMI8zvDteYfPtR+2n1R+z8w='
_JWT_SIGNING_KEY = '2a9251d782700769fb856da5725daf38661874ca6f80ae7dc2b05ec1a81a24ae'
def _extract_formats_and_subtitles(self, data, video_id):
    """Build the formats list and subtitles dict from an API media item.

    ``data`` is the parsed API response; its ``targetUrls`` entries are
    dispatched on their upper-cased ``type`` and its ``subtitleUrls``
    entries of type ``CLOSED`` are added under the ``nl`` language.
    DRM is reported through ``report_drm`` when ``data['drm']`` is truthy.

    Returns a ``(formats, subtitles)`` tuple.
    """
    if traverse_obj(data, 'drm'):
        self.report_drm(video_id)

    formats, subtitles = [], {}

    # Only targets with a valid URL and a truthy type are considered
    usable_targets = traverse_obj(
        data, ('targetUrls', lambda _, v: url_or_none(v['url']) and v['type']))
    for entry in usable_targets:
        stream_type = entry['type'].upper()
        stream_url = entry['url']

        # Each branch yields the formats to add and, where the manifest
        # type can carry them, the subtitles to merge (None otherwise)
        found_subs = None
        if stream_type in ('HLS', 'HLS_AES'):
            found_formats, found_subs = self._extract_m3u8_formats_and_subtitles(
                stream_url, video_id, 'mp4', m3u8_id=stream_type, fatal=False)
        elif stream_type == 'HDS':
            found_formats = self._extract_f4m_formats(
                stream_url, video_id, f4m_id=stream_type, fatal=False)
        elif stream_type == 'MPEG_DASH':
            found_formats, found_subs = self._extract_mpd_formats_and_subtitles(
                stream_url, video_id, mpd_id=stream_type, fatal=False)
        elif stream_type == 'HSS':
            found_formats, found_subs = self._extract_ism_formats_and_subtitles(
                stream_url, video_id, ism_id='mss', fatal=False)
        else:
            # Unknown type: expose the URL as-is under its type name
            found_formats = [{
                'format_id': stream_type,
                'url': stream_url,
            }]

        formats.extend(found_formats)
        if found_subs is not None:
            self._merge_subtitles(found_subs, target=subtitles)

    closed_captions = traverse_obj(
        data, ('subtitleUrls', lambda _, v: v['url'] and v['type'] == 'CLOSED'))
    for caption in closed_captions:
        subtitles.setdefault('nl', []).append({'url': caption['url']})

    return formats, subtitles
def _call_api(self, video_id, client='null', id_token=None, version='v2'):
    """Fetch the media-item JSON for ``video_id`` from the media aggregator.

    A player token is requested first, using a signed JWT built from
    ``_PLAYER_INFO`` plus an ``exp`` value 900 seconds from now, together
    with ``id_token`` (an empty object when none is given). ``version``
    selects the media-aggregator API version in the second request only;
    the token endpoint URL is fixed.

    HTTP 400 responses to the media-item request are accepted
    (``expected_status=400``), so the returned JSON is presumably an error
    payload in that case - callers should check it before use.
    """
    player_info = {'exp': (round(time.time(), 3) + 900), **self._PLAYER_INFO}
    signed_player_info = jwt_encode_hs256(
        player_info, self._JWT_SIGNING_KEY, headers={'kid': self._JWT_KEY_ID}).decode()

    token_headers = {
        **self.geo_verification_headers(),
        'Content-Type': 'application/json',
    }
    # Compact separators: the request body carries no extra whitespace
    token_body = json.dumps({
        'identityToken': id_token or {},
        'playerInfo': signed_player_info,
    }, separators=(',', ':')).encode()

    token_response = self._download_json(
        'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2/tokens',
        video_id, 'Downloading player token', headers=token_headers, data=token_body)
    player_token = token_response['vrtPlayerToken']

    media_item_url = f'https://media-services-public.vrt.be/media-aggregator/{version}/media-items/{video_id}'
    return self._download_json(
        media_item_url, video_id, 'Downloading API JSON', query={
            'vrtPlayerToken': player_token,
            'client': client,
        }, expected_status=400)
class VRTIE(VRTBaseIE):
IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza' IE_DESC = 'VRT NWS, Flanders News, Flandern Info and Sporza'
_VALID_URL = r'https?://(?:www\.)?(?P<site>vrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' _VALID_URL = r'https?://(?:www\.)?(?P<site>vrt\.be/vrtnws|sporza\.be)/[a-z]{2}/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/', 'url': 'https://www.vrt.be/vrtnws/nl/2019/05/15/beelden-van-binnenkant-notre-dame-een-maand-na-de-brand/',
'md5': 'e1663accf5cf13f375f3cd0d10476669',
'info_dict': { 'info_dict': {
'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd', 'id': 'pbs-pub-7855fc7b-1448-49bc-b073-316cb60caa71$vid-2ca50305-c38a-4762-9890-65cbd098b7bd',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand', 'title': 'Beelden van binnenkant Notre-Dame, één maand na de brand',
'description': 'Op maandagavond 15 april ging een deel van het dakgebinte van de Parijse kathedraal in vlammen op.', 'description': 'md5:6fd85f999b2d1841aa5568f4bf02c3ff',
'timestamp': 1557924660,
'upload_date': '20190515',
'duration': 31.2, 'duration': 31.2,
'thumbnail': 'https://images.vrt.be/orig/2019/05/15/2d914d61-7710-11e9-abcc-02b7b76bf47f.jpg',
}, },
'params': {'skip_download': 'm3u8'},
}, { }, {
'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/', 'url': 'https://sporza.be/nl/2019/05/15/de-belgian-cats-zijn-klaar-voor-het-ek/',
'md5': '910bba927566e9ab992278f647eb4b75',
'info_dict': { 'info_dict': {
'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818', 'id': 'pbs-pub-f2c86a46-8138-413a-a4b9-a0015a16ce2c$vid-1f112b31-e58e-4379-908d-aca6d80f8818',
'ext': 'mp4', 'ext': 'mp4',
'title': 'De Belgian Cats zijn klaar voor het EK mét Ann Wauters', 'title': 'De Belgian Cats zijn klaar voor het EK',
'timestamp': 1557923760, 'description': 'Video: De Belgian Cats zijn klaar voor het EK mét Ann Wauters | basketbal, sport in het journaal',
'upload_date': '20190515',
'duration': 115.17, 'duration': 115.17,
'thumbnail': 'https://images.vrt.be/orig/2019/05/15/11c0dba3-770e-11e9-abcc-02b7b76bf47f.jpg',
}, },
}, { 'params': {'skip_download': 'm3u8'},
'url': 'https://www.vrt.be/vrtnws/en/2019/05/15/belgium_s-eurovision-entry-falls-at-the-first-hurdle/',
'only_matching': True,
}, {
'url': 'https://www.vrt.be/vrtnws/de/2019/05/15/aus-fuer-eliott-im-halbfinale-des-eurosongfestivals/',
'only_matching': True,
}] }]
_CLIENT_MAP = { _CLIENT_MAP = {
'vrt.be/vrtnws': 'vrtnieuws', 'vrt.be/vrtnws': 'vrtnieuws',
@ -49,34 +141,285 @@ class VRTIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
site, display_id = self._match_valid_url(url).groups() site, display_id = self._match_valid_url(url).groups()
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
attrs = extract_attributes(self._search_regex( attrs = extract_attributes(get_element_html_by_class('vrtvideo', webpage) or '')
r'(<[^>]+class="vrtvideo( [^"]*)?"[^>]*>)', webpage, 'vrt video'))
asset_id = attrs['data-video-id'] asset_id = attrs.get('data-video-id') or attrs['data-videoid']
publication_id = attrs.get('data-publication-id') publication_id = traverse_obj(attrs, 'data-publication-id', 'data-publicationid')
if publication_id: if publication_id:
asset_id = publication_id + '$' + asset_id asset_id = f'{publication_id}${asset_id}'
client = attrs.get('data-client-code') or self._CLIENT_MAP[site] client = traverse_obj(attrs, 'data-client-code', 'data-client') or self._CLIENT_MAP[site]
data = self._call_api(asset_id, client)
formats, subtitles = self._extract_formats_and_subtitles(data, asset_id)
title = strip_or_none(get_element_by_class(
'vrt-title', webpage) or self._html_search_meta(
['og:title', 'twitter:title', 'name'], webpage))
description = self._html_search_meta( description = self._html_search_meta(
['og:description', 'twitter:description', 'description'], webpage) ['og:description', 'twitter:description', 'description'], webpage)
if description == '': if description == '':
description = None description = None
timestamp = unified_timestamp(self._html_search_meta(
'article:published_time', webpage))
return { return {
'_type': 'url_transparent',
'id': asset_id, 'id': asset_id,
'display_id': display_id, 'formats': formats,
'title': title, 'subtitles': subtitles,
'description': description, 'description': description,
'thumbnail': attrs.get('data-posterimage'), 'thumbnail': url_or_none(attrs.get('data-posterimage')),
'timestamp': timestamp,
'duration': float_or_none(attrs.get('data-duration'), 1000), 'duration': float_or_none(attrs.get('data-duration'), 1000),
'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (client, asset_id), '_old_archive_ids': [make_archive_id('Canvas', asset_id)],
'ie_key': 'Canvas', **traverse_obj(data, {
'title': ('title', {str}),
'description': ('shortDescription', {str}),
'duration': ('duration', {functools.partial(float_or_none, scale=1000)}),
'thumbnail': ('posterImageUrl', {url_or_none}),
}),
}
class VrtNUIE(VRTBaseIE):
    """Extract VRT MAX (formerly VRT NU) episodes.

    Episode metadata is read from the page's ``.model.json`` document and the
    media is resolved through ``_call_api``. When a login was performed, a
    ``vrtnutoken`` is fetched and passed along as identity token.
    """
    IE_DESC = 'VRT MAX'
    _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
    _TESTS = [{
        # CONTENT_IS_AGE_RESTRICTED
        'url': 'https://www.vrt.be/vrtnu/a-z/de-ideale-wereld/2023-vj/de-ideale-wereld-d20230116/',
        'info_dict': {
            'id': 'pbs-pub-855b00a8-6ce2-4032-ac4f-1fcf3ae78524$vid-d2243aa1-ec46-4e34-a55b-92568459906f',
            'ext': 'mp4',
            'title': 'Tom Waes',
            'description': 'Satirisch actualiteitenmagazine met Ella Leyers. Tom Waes is te gast.',
            'timestamp': 1673905125,
            'release_timestamp': 1673905125,
            'series': 'De ideale wereld',
            'season_id': '1672830988794',
            'episode': 'Aflevering 1',
            'episode_number': 1,
            'episode_id': '1672830988861',
            'display_id': 'de-ideale-wereld-d20230116',
            'channel': 'VRT',
            'duration': 1939.0,
            'thumbnail': 'https://images.vrt.be/orig/2023/01/10/1bb39cb3-9115-11ed-b07d-02b7b76bf47f.jpg',
            'release_date': '20230116',
            'upload_date': '20230116',
            'age_limit': 12,
        },
    }, {
        'url': 'https://www.vrt.be/vrtnu/a-z/buurman--wat-doet-u-nu-/6/buurman--wat-doet-u-nu--s6-trailer/',
        'info_dict': {
            'id': 'pbs-pub-ad4050eb-d9e5-48c2-9ec8-b6c355032361$vid-0465537a-34a8-4617-8352-4d8d983b4eee',
            'ext': 'mp4',
            'title': 'Trailer seizoen 6 \'Buurman, wat doet u nu?\'',
            'description': 'md5:197424726c61384b4e5c519f16c0cf02',
            'timestamp': 1652940000,
            'release_timestamp': 1652940000,
            'series': 'Buurman, wat doet u nu?',
            'season': 'Seizoen 6',
            'season_number': 6,
            'season_id': '1652344200907',
            'episode': 'Aflevering 0',
            'episode_number': 0,
            'episode_id': '1652951873524',
            'display_id': 'buurman--wat-doet-u-nu--s6-trailer',
            'channel': 'VRT',
            'duration': 33.13,
            'thumbnail': 'https://images.vrt.be/orig/2022/05/23/3c234d21-da83-11ec-b07d-02b7b76bf47f.jpg',
            'release_date': '20220519',
            'upload_date': '20220519',
        },
        'params': {'skip_download': 'm3u8'},
    }]
    _NETRC_MACHINE = 'vrtnu'
    # Set to True on the instance once _perform_login has completed
    _authenticated = False

    def _perform_login(self, username, password):
        """Log in via Gigya, then exchange the Gigya signature for VRT session cookies.

        Raises ExtractorError when Gigya reports an error or when the XSRF
        cookie is not set by the init-login request. HTTP 401 responses are
        handed to the retry manager; any other error propagates immediately.
        """
        auth_info = self._gigya_login({
            'APIKey': '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy',
            'targetEnv': 'jssdk',
            'loginID': username,
            'password': password,
            'authMode': 'cookie',
        })

        if auth_info.get('errorDetails'):
            raise ExtractorError(f'Unable to login. VrtNU said: {auth_info["errorDetails"]}', expected=True)

        # Sometimes authentication fails for no good reason, retry
        for retry in self.RetryManager():
            if retry.attempt > 1:
                self._sleep(1, None)
            try:
                self._request_webpage(
                    'https://token.vrt.be/vrtnuinitlogin', None, note='Requesting XSRF Token',
                    errnote='Could not get XSRF Token', query={
                        'provider': 'site',
                        'destination': 'https://www.vrt.be/vrtnu/',
                    })

                # The cookie is expected to be set by the request above; fail
                # with a readable error instead of an AttributeError on None
                xsrf_cookie = self._get_cookies('https://login.vrt.be').get('OIDCXSRF')
                if xsrf_cookie is None:
                    raise ExtractorError('Could not get XSRF Token')

                self._request_webpage(
                    'https://login.vrt.be/perform_login', None,
                    note='Performing login', errnote='Login failed',
                    query={'client_id': 'vrtnu-site'}, data=urlencode_postdata({
                        'UID': auth_info['UID'],
                        'UIDSignature': auth_info['UIDSignature'],
                        'signatureTimestamp': auth_info['signatureTimestamp'],
                        '_csrf': xsrf_cookie.value,
                    }))
            except ExtractorError as e:
                if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401:
                    retry.error = e
                    continue
                raise

        self._authenticated = True

    def _real_extract(self, url):
        """Return the info dict for an episode URL.

        Raises ExtractorError when no publication/video ID pair is found, and
        maps the API's error codes to login-required or geo-restriction
        errors where one applies.
        """
        display_id = self._match_id(url)
        parsed_url = urllib.parse.urlparse(url)
        # Same URL with any trailing slash removed and '.model.json' appended
        details = self._download_json(
            f'{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip("/")}.model.json',
            display_id, 'Downloading asset JSON', 'Unable to download asset JSON')['details']

        watch_info = traverse_obj(details, (
            'actions', lambda _, v: v['type'] == 'watch-episode', {dict}), get_all=False) or {}
        video_id = join_nonempty(
            'episodePublicationId', 'episodeVideoId', delim='$', from_dict=watch_info)
        # Both parts are required; a lone ID contains no '$'
        if '$' not in video_id:
            raise ExtractorError('Unable to extract video ID')

        vrtnutoken = None
        if self._authenticated:
            vrtnutoken = self._download_json(
                'https://token.vrt.be/refreshtoken', video_id, note='Retrieving vrtnutoken',
                errnote='Token refresh failed')['vrtnutoken']

        video_info = self._call_api(video_id, 'vrtnu-web@PROD', vrtnutoken)

        if 'title' not in video_info:
            code = video_info.get('code')
            if code in ('AUTHENTICATION_REQUIRED', 'CONTENT_IS_AGE_RESTRICTED'):
                self.raise_login_required(code, method='password')
            elif code in ('INVALID_LOCATION', 'CONTENT_AVAILABLE_ONLY_IN_BE'):
                self.raise_geo_restricted(countries=['BE'])
            elif code == 'CONTENT_AVAILABLE_ONLY_FOR_BE_RESIDENTS_AND_EXPATS':
                if not self._authenticated:
                    self.raise_login_required(code, method='password')
                self.raise_geo_restricted(countries=['BE'])
            # Fall back to a generic message so the error never reads 'None'
            raise ExtractorError(code or 'Unable to extract video information', expected=True)

        formats, subtitles = self._extract_formats_and_subtitles(video_info, video_id)

        return {
            **traverse_obj(details, {
                'title': 'title',
                'description': ('description', {clean_html}),
                'timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}),
                'release_timestamp': ('data', 'episode', 'onTime', 'raw', {parse_iso8601}),
                'series': ('data', 'program', 'title'),
                'season': ('data', 'season', 'title', 'value'),
                'season_number': ('data', 'season', 'title', 'raw', {int_or_none}),
                'season_id': ('data', 'season', 'id', {str_or_none}),
                'episode': ('data', 'episode', 'number', 'value', {str_or_none}),
                'episode_number': ('data', 'episode', 'number', 'raw', {int_or_none}),
                'episode_id': ('data', 'episode', 'id', {str_or_none}),
                'age_limit': ('data', 'episode', 'age', 'raw', {parse_age_limit}),
            }),
            'id': video_id,
            'display_id': display_id,
            'channel': 'VRT',
            'formats': formats,
            'duration': float_or_none(video_info.get('duration'), 1000),
            'thumbnail': url_or_none(video_info.get('posterImageUrl')),
            'subtitles': subtitles,
            # Keeps download-archive entries recorded under the 'Canvas' extractor key valid
            '_old_archive_ids': [make_archive_id('Canvas', video_id)],
        }
class KetnetIE(VRTBaseIE):
    """Extract videos from ketnet.be.

    Metadata comes from the Ketnet GraphQL endpoint; formats and subtitles
    are resolved through ``_call_api`` using the ``v1`` media-aggregator API.
    """
    _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.ketnet.be/kijken/m/meisjes/6/meisjes-s6a5',
        'info_dict': {
            'id': 'pbs-pub-39f8351c-a0a0-43e6-8394-205d597d6162$vid-5e306921-a9aa-4fa9-9f39-5b82c8f1028e',
            'ext': 'mp4',
            'title': 'Meisjes',
            'episode': 'Reeks 6: Week 5',
            'season': 'Reeks 6',
            'series': 'Meisjes',
            'timestamp': 1685251800,
            'upload_date': '20230528',
        },
        'params': {'skip_download': 'm3u8'},
    }]

    def _real_extract(self, url):
        """Return the info dict for a ketnet.be video page URL."""
        display_id = self._match_id(url)

        # The URL path (display_id) identifies the content node to query
        video = self._download_json(
            'https://senior-bff.ketnet.be/graphql', display_id, query={
                'query': '''{
video(id: "content/ketnet/nl/%s.model.json") {
description
episodeNr
imageUrl
mediaReference
programTitle
publicationDate
seasonTitle
subtitleVideodetail
titleVideodetail
}
}''' % display_id,
            })['data']['video']

        # mediaReference is percent-encoded in the response
        video_id = urllib.parse.unquote(video['mediaReference'])
        data = self._call_api(video_id, 'ketnet@PROD', version='v1')
        formats, subtitles = self._extract_formats_and_subtitles(data, video_id)

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            # Keeps download-archive entries recorded under the 'Canvas' extractor key valid
            '_old_archive_ids': [make_archive_id('Canvas', video_id)],
            **traverse_obj(video, {
                'title': ('titleVideodetail', {str}),
                'description': ('description', {str}),
                # The query above requests the image as 'imageUrl'; no
                # 'thumbnail' field is asked for, so read the requested one
                'thumbnail': ('imageUrl', {url_or_none}),
                'timestamp': ('publicationDate', {parse_iso8601}),
                'series': ('programTitle', {str}),
                'season': ('seasonTitle', {str}),
                'episode': ('subtitleVideodetail', {str}),
                'episode_number': ('episodeNr', {int_or_none}),
            }),
        }
class DagelijkseKostIE(VRTBaseIE):
    """Extract recipe videos from dagelijksekost.een.be.

    Title and description are scraped from the page; formats and subtitles
    are resolved through ``_call_api`` using the ``v1`` media-aggregator API.
    """
    IE_DESC = 'dagelijksekost.een.be'
    _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
        'info_dict': {
            'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
            'ext': 'mp4',
            'title': 'Hachis parmentier met witloof',
            'description': 'md5:9960478392d87f63567b5b117688cdc5',
            'display_id': 'hachis-parmentier-met-witloof',
        },
        'params': {'skip_download': 'm3u8'},
    }]

    def _real_extract(self, url):
        """Return the info dict for a recipe page URL."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        video_id = self._html_search_regex(
            r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', group='id')
        media_item = self._call_api(video_id, 'dako@prod', version='v1')
        formats, subtitles = self._extract_formats_and_subtitles(media_item, video_id)

        # Page element first, meta tag as fallback
        raw_title = get_element_by_class('dish-metadata__title', webpage)
        if not raw_title:
            raw_title = self._html_search_meta('twitter:title', webpage)
        title = strip_or_none(raw_title)

        description = clean_html(get_element_by_class('dish-description', webpage))
        if not description:
            description = self._html_search_meta(
                ['description', 'twitter:description', 'og:description'], webpage)

        return {
            'id': video_id,
            'formats': formats,
            'subtitles': subtitles,
            'display_id': display_id,
            'title': title,
            'description': description,
            # Keeps download-archive entries recorded under the 'Canvas' extractor key valid
            '_old_archive_ids': [make_archive_id('Canvas', video_id)],
        }