[youtube] Improve video upload date handling (#3029)

* Don't prefer UTC upload date for past live streams/premieres
* Improve regex (fixes a regression)

Authored-by: coletdjnz
coletdev 2022-03-14 11:02:44 +13:00 committed by GitHub
parent 5ca764c506
commit 17322130a9
1 changed file with 90 additions and 91 deletions


@@ -730,11 +730,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             timestamp = (
                 unified_timestamp(text) or unified_timestamp(
                     self._search_regex(
-                        (r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*on)?\s*(.+\d)', r'\w+[\s,\.-]*\w+[\s,\.-]+20\d{2}'),
+                        (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'),
                         text.lower(), 'time text', default=None)))

         if text and timestamp is None:
-            self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True)
+            self.report_warning(f"Cannot parse localized time text '{text}'" + bug_reports_message(), only_once=True)
         return timestamp, text

     def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None,
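
Note on the regex change above: the explicit "<month> <day>, <year>" capture is now tried before the looser live/premiere phrasing. A minimal standalone sketch of that pattern order, using plain re instead of yt-dlp's _search_regex/unified_timestamp helpers and made-up sample strings:

import re

# New pattern order: an explicit "<month> <day>, <year>" date first,
# with the looser live/premiere phrasing only as a fallback.
PATTERNS = (
    r'([a-z]+\s*\d{1,2},?\s*20\d{2})',
    r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)',
)

def extract_date_text(time_text):
    # Roughly what _search_regex does with a tuple of patterns: first hit wins.
    # In the extractor, the returned string is then fed to unified_timestamp().
    for pattern in PATTERNS:
        mobj = re.search(pattern, time_text.lower())
        if mobj:
            return mobj.group(1)
    return None

print(extract_date_text('Streamed live on Nov 19, 2015'))  # -> 'nov 19, 2015'
print(extract_date_text('Premiered Dec 28, 2019'))         # -> 'dec 28, 2019'
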
@@ -1204,7 +1204,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'id': 'Tq92D6wQ1mg',
             'title': '[MMD] Adios - EVERGLOW [+Motion DL]',
             'ext': 'mp4',
-            'upload_date': '20191227',
+            'upload_date': '20191228',
             'uploader_id': 'UC1yoRdFoFJaCY-AGfD9W0wQ',
             'uploader': 'Projekt Melody',
             'description': 'md5:17eccca93a786d51bc67646756894066',
@@ -1297,6 +1297,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         },
         'expected_warnings': [
             'DASH manifest missing',
+            'Some formats are possibly damaged'
         ]
     },
     # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
@@ -1569,7 +1570,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'title': 'md5:e41008789470fc2533a3252216f1c1d1',
             'description': 'md5:a677553cf0840649b731a3024aeff4cc',
             'duration': 721,
-            'upload_date': '20150127',
+            'upload_date': '20150128',
             'uploader_id': 'BerkmanCenter',
             'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
             'uploader': 'The Berkman Klein Center for Internet & Society',
@@ -1601,7 +1602,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
             'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
             'duration': 4060,
-            'upload_date': '20151119',
+            'upload_date': '20151120',
             'uploader': 'Bernie Sanders',
             'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
             'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
@@ -3565,14 +3566,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             or self._extract_chapters_from_engagement_panel(initial_data, duration)
             or None)

-        contents = try_get(
-            initial_data,
-            lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
-            list) or []
-        for content in contents:
-            vpir = content.get('videoPrimaryInfoRenderer')
-            if vpir:
-                info['upload_date'] = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d')
-                stl = vpir.get('superTitleLink')
-                if stl:
-                    stl = self._get_text(stl)
+        contents = traverse_obj(
+            initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'),
+            expected_type=list, default=[])
+
+        vpir = get_first(contents, 'videoPrimaryInfoRenderer')
+        if vpir:
+            stl = vpir.get('superTitleLink')
+            if stl:
+                stl = self._get_text(stl)
@@ -3613,7 +3612,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'like_count': str_to_int(like_count),
                 'dislike_count': str_to_int(dislike_count),
             })
-        vsir = content.get('videoSecondaryInfoRenderer')
+        vsir = get_first(contents, 'videoSecondaryInfoRenderer')
         if vsir:
             vor = traverse_obj(vsir, ('owner', 'videoOwnerRenderer'))
             info.update({
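
The two hunks above replace the `for content in contents:` loop with get_first lookups. As a rough standalone illustration of what get_first(contents, ...) resolves to at this call site (yt-dlp's real get_first/traverse_obj helpers are more general; the initial_data below is made up and heavily trimmed):

def first_renderer(contents, key):
    # Roughly equivalent, for this call site, to get_first(contents, key):
    # the first non-empty value of `key` among the renderer dicts.
    for content in contents or []:
        renderer = content.get(key)
        if renderer:
            return renderer
    return None

# Made-up, heavily trimmed initial_data for illustration only
initial_data = {'contents': {'twoColumnWatchNextResults': {'results': {'results': {'contents': [
    {'videoPrimaryInfoRenderer': {'dateText': {'simpleText': 'Nov 19, 2015'}}},
    {'videoSecondaryInfoRenderer': {'owner': {'videoOwnerRenderer': {}}}},
]}}}}}

contents = (initial_data.get('contents', {}).get('twoColumnWatchNextResults', {})
            .get('results', {}).get('results', {}).get('contents', []))
vpir = first_renderer(contents, 'videoPrimaryInfoRenderer')
vsir = first_renderer(contents, 'videoSecondaryInfoRenderer')
print(vpir['dateText']['simpleText'])        # Nov 19, 2015
print('videoOwnerRenderer' in vsir['owner'])  # True
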
@@ -3652,15 +3651,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'channel_url': 'uploader_url',
         }

-        # The upload date for scheduled and current live streams / premieres in microformats
-        # is generally the true upload date. Although not in UTC, we will prefer that in this case.
-        # Note this changes to the published date when the stream/premiere has finished.
+        # The upload date for scheduled, live and past live streams / premieres in microformats
+        # may be different from the stream date. Although not in UTC, we will prefer it in this case.
         # See: https://github.com/yt-dlp/yt-dlp/pull/2223#issuecomment-1008485139
-        if not info.get('upload_date') or info.get('is_live') or info.get('live_status') == 'is_upcoming':
-            info['upload_date'] = (
-                unified_strdate(get_first(microformats, 'uploadDate'))
-                or unified_strdate(search_meta('uploadDate'))
-                or info.get('upload_date'))
+        upload_date = (
+            unified_strdate(get_first(microformats, 'uploadDate'))
+            or unified_strdate(search_meta('uploadDate')))
+        if not upload_date or (not info.get('is_live') and not info.get('was_live') and info.get('live_status') != 'is_upcoming'):
+            upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d')
+        info['upload_date'] = upload_date

         for to, frm in fallbacks.items():
             if not info.get(to):
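
Net effect of the hunk above: the microformats/meta uploadDate is kept for live, formerly live and upcoming videos, while ordinary uploads (or videos with no microformats date) use the date parsed from the visible dateText instead. A condensed sketch of that decision with the extraction calls stubbed out; the *_date arguments are placeholders for the unified_strdate/strftime_or_none results:

def resolve_upload_date(microformats_date, meta_date, date_text_date, info):
    # microformats_date / meta_date / date_text_date stand in for
    # unified_strdate(...) / strftime_or_none(...) results ('YYYYMMDD' or None).
    upload_date = microformats_date or meta_date
    live_related = (info.get('is_live') or info.get('was_live')
                    or info.get('live_status') == 'is_upcoming')
    if not upload_date or not live_related:
        # Ordinary upload (or no microformats date): use the parsed dateText
        upload_date = date_text_date
    return upload_date

# Past premiere: keep the microformats date
print(resolve_upload_date('20191227', None, '20191228', {'was_live': True}))   # 20191227
# Regular upload: the parsed dateText wins
print(resolve_upload_date('20191227', None, '20191228', {'was_live': False}))  # 20191228
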