From 485cbe49904dc8b9748e43f346f669788658a9d4 Mon Sep 17 00:00:00 2001
From: subrat-lima
Date: Sun, 1 Sep 2024 17:06:24 +0530
Subject: [PATCH] [ie/atptour] Add extractors; let `_hidden_inputs` key on
 custom attributes

---
Note: the blob hashes on the "index" lines below are carried over from the
original patch and were not recomputed for the revised contents.

 yt_dlp/extractor/_extractors.py |   4 +
 yt_dlp/extractor/atptour.py     | 137 ++++++++++++++++++++++++++++++++
 yt_dlp/extractor/common.py      |  11 ++-
 3 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 yt_dlp/extractor/atptour.py

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index a3610dc97..f486d46cd 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -169,6 +169,10 @@ from .asobichannel import (
     AsobiChannelTagURLIE,
 )
 from .asobistage import AsobiStageIE
+from .atptour import (
+    ATPTourNewsIE,
+    ATPTourVideoIE,
+)
 from .atresplayer import AtresPlayerIE
 from .atscaleconf import AtScaleConfEventIE
 from .atvat import ATVAtIE
diff --git a/yt_dlp/extractor/atptour.py b/yt_dlp/extractor/atptour.py
new file mode 100644
index 000000000..2994f5739
--- /dev/null
+++ b/yt_dlp/extractor/atptour.py
@@ -0,0 +1,137 @@
+import re
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    extract_attributes,
+    get_element_html_by_id,
+    traverse_obj,
+    urljoin,
+)
+
+
+class ATPTourBaseIE(InfoExtractor):
+    """Shared helpers for the atptour.com extractors"""
+
+    _BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/%s/%s/index.html?videoId=%s'
+
+    def _brightcove_result(self, account_id, player_id, video_id):
+        """Return a BrightcoveNew url_result, or None if any of the IDs is missing"""
+        if not all((account_id, player_id, video_id)):
+            return None
+        return self.url_result(
+            self._BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), BrightcoveNewIE)
+
+
+class ATPTourVideoIE(ATPTourBaseIE):
+    """Extractor for single video pages (atptour.com/en/video/<slug>)"""
+
+    IE_NAME = 'atptour:video'
+    _VALID_URL = r'https?://(?:www\.)?atptour\.com/en/video/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://www.atptour.com/en/video/challenger-highlights-nishikori-wins-in-como-2024',
+        'md5': '4721002227d98fe89afafa40eba3068d',
+        'info_dict': {
+            'id': '6361099221112',
+            'ext': 'mp4',
+            'description': 'md5:ef8afed21c52cbe4ad3409045d59f413',
+            'upload_date': '20240827',
+            'duration': 105.152,
+            'tags': 'count:6',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'title': 'Challenger Highlights: Nishikori wins in Como 2024',
+            'uploader_id': '6057277721001',
+            'timestamp': 1724775281,
+        },
+    }, {
+        'url': 'https://www.atptour.com/en/video/highlights-svajda-earns-highestranked-win-of-career-vs-cerundolo-winstonsalem-2024',
+        'md5': 'a3829d10bdcb1829568fd88b9e6ecb15',
+        'info_dict': {
+            'id': '6360716257112',
+            'ext': 'mp4',
+            'description': 'md5:a334aeb73eac631ffab8249b1e68194c',
+            'upload_date': '20240820',
+            'duration': 139.691,
+            'tags': 'count:5',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'title': 'Highlights: Svajda earns highest-ranked win of career vs. Cerundolo Winston-Salem 2024',
+            'uploader_id': '6057277721001',
+            'timestamp': 1724183755,
+        },
+    }, {
+        'url': 'https://www.atptour.com/en/video/highlights-sonego-dominates-michelsen-for-winston-salem-open-title-2024',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        # Fatal on purpose: every later step needs the page contents
+        webpage = self._download_webpage(url, display_id, impersonate=True)
+
+        # The JSON endpoint is only exposed as the value of a hidden <input>
+        # that is identified by its class attribute rather than name/id
+        endpoint = self._hidden_inputs(webpage, 'class').get('atp_featured-videos-endpoint')
+        if not endpoint:
+            raise ExtractorError('Unable to find the featured videos endpoint')
+        json_data = self._download_json(urljoin(url, endpoint), display_id, impersonate=True)
+
+        # NOTE(review): assumes the first item of "content" is this page's video - confirm
+        video_data = traverse_obj(json_data, ('content', 0)) or {}
+        result = self._brightcove_result(
+            traverse_obj(video_data, 'videoAccountId'),
+            traverse_obj(video_data, 'videoPlayerId'),
+            traverse_obj(video_data, 'videoId'))
+        if not result:
+            raise ExtractorError('Unable to extract the Brightcove video IDs')
+        return result
+
+
+class ATPTourNewsIE(ATPTourBaseIE):
+    """Extractor for news articles; yields every embedded video as a playlist"""
+
+    IE_NAME = 'atptour:news'
+    _VALID_URL = r'https?://(?:www\.)?atptour\.com/en/news/(?P<id>[\w-]+)'
+    _TESTS = [{
+        'url': 'https://www.atptour.com/en/news/sinner-zverev-cincinnati-2024-sf',
+        'playlist_mincount': 2,
+        'info_dict': {
+            'id': 'sinner-zverev-cincinnati-2024-sf',
+            'title': 'Jannik Sinner battles past Alexander Zverev to reach Cincinnati final | ATP Tour | Tennis',
+            'description': 'md5:30cd3df666c8a5d45731d1e85d8d43ae',
+        },
+    }, {
+        'url': 'https://www.atptour.com/en/news/borges-us-open-2024-this-is-tennis',
+        'playlist_mincount': 1,
+        'info_dict': {
+            'id': 'borges-us-open-2024-this-is-tennis',
+            'title': 'Nuno Borges: Building legos, facing Nadal, Cirque du Soleil & more | ATP Tour | Tennis',
+            'description': 'md5:aaef866660c4e3ced69118c0f6ed237a',
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id, impersonate=True)
+
+        entries = []
+
+        # The lead video is an element with id "articleVideoJSPlayer" whose
+        # data-* attributes hold the Brightcove IDs
+        player_html = get_element_html_by_id('articleVideoJSPlayer', webpage)
+        if player_html:
+            attrs = extract_attributes(player_html)
+            lead_video = self._brightcove_result(
+                attrs.get('data-account'), attrs.get('data-player'), attrs.get('data-video-id'))
+            if lead_video:
+                entries.append(lead_video)
+
+        # Any other videos are embedded as Brightcove player iframes
+        entries.extend(
+            self.url_result(iframe_url, BrightcoveNewIE)
+            for iframe_url in re.findall(
+                r'<iframe[^>]+src="(https://players\.brightcove\.net/[^"]+)"', webpage))
+
+        return self.playlist_result(
+            entries, display_id, self._html_extract_title(webpage),
+            self._og_search_description(webpage))
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 187f73e7b..16271cbc0 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1779,7 +1779,13 @@ class InfoExtractor:
         return traverse_obj(ret, traverse) or {}
 
     @staticmethod
-    def _hidden_inputs(html):
+    def _hidden_inputs(html, attr_list=('name', 'id')):
+        """
+        Collect the values of hidden and submit <input> elements in html
+
+        @param attr_list    Attribute name(s) to try, in order, as the key of each input;
+                            the first one with a non-empty value is used
+        """
         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
         hidden_inputs = {}
         for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
@@ -1788,7 +1794,8 @@ class InfoExtractor:
                 continue
             if attrs.get('type') not in ('hidden', 'submit'):
                 continue
-            name = attrs.get('name') or attrs.get('id')
+            # First truthy attribute wins, which keeps the previous `name or id` fallback
+            name = next(filter(None, map(attrs.get, variadic(attr_list))), None)
             value = attrs.get('value')
             if name and value is not None:
                 hidden_inputs[name] = value