Compare commits

...

5 Commits

Author SHA1 Message Date
Subrat Lima 4743918c09
Merge 2c0244cb2f into 46f4c80bc3 2024-09-07 17:21:53 +02:00
sepro 46f4c80bc3
[ie/SampleFocus] Fix extractor (#10947)
Closes #10945
Authored by: seproDev
2024-09-07 17:06:12 +02:00
subrat-lima 2c0244cb2f [ie/atptour] refactored url pattern for better extensibility 2024-09-01 21:46:17 +05:30
subrat-lima 2fe0226c0f [ie/atptour] enhancement - added support for spanish pages 2024-09-01 20:17:25 +05:30
subrat-lima 485cbe4990 [ie/atptour] add extractor and updated data extraction function 2024-09-01 17:06:24 +05:30
4 changed files with 146 additions and 4 deletions

View File

@ -169,6 +169,10 @@ from .asobichannel import (
AsobiChannelTagURLIE,
)
from .asobistage import AsobiStageIE
from .atptour import (
ATPTourNewsIE,
ATPTourVideoIE,
)
from .atresplayer import AtresPlayerIE
from .atscaleconf import AtScaleConfEventIE
from .atvat import ATVAtIE

127
yt_dlp/extractor/atptour.py Normal file
View File

@ -0,0 +1,127 @@
import re

from .brightcove import BrightcoveNewIE
from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    base_url,
    extract_attributes,
    get_element_html_by_id,
    traverse_obj,
    urljoin,
)
class ATPTourVideoIE(InfoExtractor):
    """Extractor for single video pages under atptour.com/(en|es)/video/.

    The video page is only used to locate a JSON endpoint (published in a
    hidden ``<input>`` keyed by its ``class`` attribute).  The first entry of
    that JSON's ``content`` list supplies the Brightcove account, player and
    video ids, and the actual extraction is delegated to ``BrightcoveNewIE``.
    """

    IE_NAME = 'atptour:video'
    _VALID_URL = r'https?://(?:www\.)?atptour\.com/(?:en|es)/video/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.atptour.com/en/video/challenger-highlights-nishikori-wins-in-como-2024',
        'md5': '4721002227d98fe89afafa40eba3068d',
        'info_dict': {
            'id': '6361099221112',
            'ext': 'mp4',
            'description': 'md5:ef8afed21c52cbe4ad3409045d59f413',
            'upload_date': '20240827',
            'duration': 105.152,
            'tags': 'count:6',
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Challenger Highlights: Nishikori wins in Como 2024',
            'uploader_id': '6057277721001',
            'timestamp': 1724775281,
        },
    }, {
        'url': 'https://www.atptour.com/en/video/highlights-svajda-earns-highestranked-win-of-career-vs-cerundolo-winstonsalem-2024',
        'md5': 'a3829d10bdcb1829568fd88b9e6ecb15',
        'info_dict': {
            'id': '6360716257112',
            'ext': 'mp4',
            'description': 'md5:a334aeb73eac631ffab8249b1e68194c',
            'upload_date': '20240820',
            'duration': 139.691,
            'tags': 'count:5',
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Highlights: Svajda earns highest-ranked win of career vs. Cerundolo Winston-Salem 2024',
            'uploader_id': '6057277721001',
            'timestamp': 1724183755,
        },
    }, {
        'url': 'https://www.atptour.com/es/video/highlights-michelsen-defeats-fucsovics-in-winston-salem-2024',
        'md5': '7ba4c3aabef9eb20a1b9877f28e6f775',
        'info_dict': {
            'id': '6360727636112',
            'ext': 'mp4',
            'description': 'md5:2c5682fdfa514e508c6d947e9e9b6eeb',
            'upload_date': '20240821',
            'duration': 135.424,
            'tags': 'count:6',
            'thumbnail': r're:^https?://.*\.jpg$',
            'title': 'Highlights: Michelsen defeats Fucsovics in Winston-Salem 2024',
            'uploader_id': '6057277721001',
            'timestamp': 1724205624,
        },
    }, {
        'url': 'https://www.atptour.com/en/video/highlights-sonego-dominates-michelsen-for-winston-salem-open-title-2024',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a video page URL to a Brightcove player URL result.

        Raises ExtractorError if the page does not expose the featured-videos
        endpoint or the endpoint's first entry lacks any Brightcove id.
        """
        display_id = self._match_id(url)

        # Both requests are mandatory for extraction, so let them fail loudly:
        # with fatal=False a failed download returns False, which would only
        # surface later as an unrelated TypeError.
        webpage = self._download_webpage(url, display_id, impersonate=True)

        # The endpoint path is stored in a hidden <input> identified by its
        # class attribute rather than by name/id.
        hidden_inputs = self._hidden_inputs(webpage, 'class')
        endpoint = hidden_inputs.get('atp_featured-videos-endpoint')
        if not endpoint:
            raise ExtractorError('Unable to find the featured videos endpoint')

        featured_videos_url = urljoin(base_url(url), endpoint)
        json_data = self._download_json(featured_videos_url, display_id, impersonate=True)

        # Only the first entry of "content" is used.
        video_data = traverse_obj(json_data, ('content', 0))
        account_id = traverse_obj(video_data, 'videoAccountId')
        player_id = traverse_obj(video_data, 'videoPlayerId')
        video_id = traverse_obj(video_data, 'videoId')

        # Never hand a URL containing the text "None" to the Brightcove extractor.
        if not all((account_id, player_id, video_id)):
            raise ExtractorError('Unable to extract Brightcove video parameters')

        return self.url_result(
            f'https://players.brightcove.net/{account_id}/{player_id}/index.html?videoId={video_id}', BrightcoveNewIE)
class ATPTourNewsIE(InfoExtractor):
    """Extractor for news articles under atptour.com/(en|es)/news/.

    Produces a playlist of the Brightcove videos found in the article: the
    lead player element (id ``articleVideoJSPlayer``), if present, followed by
    every Brightcove player embedded through an ``<iframe>``.
    """

    IE_NAME = 'atptour:news'
    _VALID_URL = r'https?://(?:www\.)?atptour\.com/(?:en|es)/news/(?P<id>[\w-]+)'
    _TESTS = [{
        'url': 'https://www.atptour.com/en/news/sinner-zverev-cincinnati-2024-sf',
        'playlist_mincount': 2,
        'info_dict': {
            'id': 'sinner-zverev-cincinnati-2024-sf',
            'title': 'Jannik Sinner battles past Alexander Zverev to reach Cincinnati final | ATP Tour | Tennis',
            'description': 'md5:30cd3df666c8a5d45731d1e85d8d43ae',
        },
    }, {
        'url': 'https://www.atptour.com/en/news/borges-us-open-2024-this-is-tennis',
        'playlist_mincount': 1,
        'info_dict': {
            'id': 'borges-us-open-2024-this-is-tennis',
            'title': 'Nuno Borges: Building legos, facing Nadal, Cirque du Soleil & more | ATP Tour | Tennis',
            'description': 'md5:aaef866660c4e3ced69118c0f6ed237a',
        },
    }, {
        'url': 'https://www.atptour.com/es/news/popyrin-us-open-2024-feature',
        'playlist_mincount': 1,
        'info_dict': {
            'id': 'popyrin-us-open-2024-feature',
            'title': 'Alexei Popyrin: Hamilton, pollo frito y la revancha de Djokovic | ATP Tour | Tennis',
            'description': 'md5:b62a35720a278c9ab8410847915dc581',
        },
    }]

    def _real_extract(self, url):
        """Build a playlist of the Brightcove videos embedded in a news article."""
        display_id = self._match_id(url)

        # The page is required for everything below; with fatal=False a failed
        # download would return False and crash the HTML helpers instead.
        webpage = self._download_webpage(url, display_id, impersonate=True)
        title = self._html_extract_title(webpage)
        description = self._og_search_description(webpage)

        entries = []

        # Lead video: an element carrying the Brightcove ids as data-* attributes.
        first_video = get_element_html_by_id('articleVideoJSPlayer', webpage)
        if first_video is not None:
            attributes = extract_attributes(first_video)
            account_id = traverse_obj(attributes, 'data-account')
            player_id = traverse_obj(attributes, 'data-player')
            video_id = traverse_obj(attributes, 'data-video-id')
            # Skip an incomplete player instead of queueing a ".../None/None/..." URL.
            if all((account_id, player_id, video_id)):
                first_video_url = f'https://players.brightcove.net/{account_id}/{player_id}/index.html?videoId={video_id}'
                entries.append(self.url_result(first_video_url, BrightcoveNewIE))

        # Inline videos: Brightcove players embedded as iframes.  Any number of
        # attributes may precede src; the previous pattern ("[^>]" without a
        # quantifier) only matched when src was the very first attribute.
        iframe_urls = re.findall(
            r'<iframe\b[^>]*?\ssrc="(https://players\.brightcove\.net/[^"]+)"', webpage)
        entries.extend(self.url_result(video_url, BrightcoveNewIE) for video_url in iframe_urls)

        return self.playlist_result(entries, display_id, title, description)

View File

@ -1779,7 +1779,7 @@ class InfoExtractor:
return traverse_obj(ret, traverse) or {}
@staticmethod
def _hidden_inputs(html):
def _hidden_inputs(html, attr_list=('name', 'id')):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
hidden_inputs = {}
for input_el in re.findall(r'(?i)(<input[^>]+>)', html):
@ -1788,7 +1788,10 @@ class InfoExtractor:
continue
if attrs.get('type') not in ('hidden', 'submit'):
continue
name = attrs.get('name') or attrs.get('id')
for attr in variadic(attr_list):
name = attrs.get(attr)
if name is not None:
break
value = attrs.get('value')
if name and value is not None:
hidden_inputs[name] = value

View File

@ -36,7 +36,7 @@ class SampleFocusIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
webpage = self._download_webpage(url, display_id, impersonate=True)
sample_id = self._search_regex(
r'<input[^>]+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P<id>\d+)',
@ -82,7 +82,15 @@ class SampleFocusIE(InfoExtractor):
return {
'id': sample_id,
'title': title,
'url': mp3_url,
'formats': [{
'url': mp3_url,
'ext': 'mp3',
'vcodec': 'none',
'acodec': 'mp3',
'http_headers': {
'Referer': url,
},
}],
'display_id': display_id,
'thumbnail': thumbnail,
'uploader': uploader,