From 361630015535026712bdb67f804a15b65ff9ee7e Mon Sep 17 00:00:00 2001 From: HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> Date: Fri, 17 Feb 2023 12:19:24 +0900 Subject: [PATCH] [extractor/yappy] Add extractor (#6111) Authored by: HobbyistDev Closes #3522 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/yappy.py | 99 +++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 yt_dlp/extractor/yappy.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 0a36e98de2..4aab6ea78f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2329,6 +2329,7 @@ from .yandexvideo import ( ZenYandexChannelIE, ) from .yapfiles import YapFilesIE +from .yappy import YappyIE from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .yle_areena import YleAreenaIE diff --git a/yt_dlp/extractor/yappy.py b/yt_dlp/extractor/yappy.py new file mode 100644 index 0000000000..f168bdbf9a --- /dev/null +++ b/yt_dlp/extractor/yappy.py @@ -0,0 +1,99 @@ +from .common import InfoExtractor +from ..utils import ( + int_or_none, + traverse_obj, + unified_timestamp, + url_or_none +) + + +class YappyIE(InfoExtractor): + _VALID_URL = r'https?://yappy\.media/video/(?P\w+)' + _TESTS = [{ + 'url': 'https://yappy.media/video/47fea6d8586f48d1a0cf96a7342aabd2', + 'info_dict': { + 'id': '47fea6d8586f48d1a0cf96a7342aabd2', + 'ext': 'mp4', + 'title': 'Куда нажимать? Как снимать? Смотри видос и погнали!🤘🏻', + 'timestamp': 1661893200, + 'description': 'Куда нажимать? Как снимать? Смотри видос и погнали!🤘🏻', + 'thumbnail': 'https://cdn-st.ritm.media/static/pic/thumbnails/0c7c4d73388f47848acaf540d2e2bb8c-thumbnail.jpg', + 'upload_date': '20220830', + 'view_count': int, + 'like_count': int, + 'uploader_id': '59a0c8c485e5410b9c43474bf4c6a373', + 'categories': ['Образование и наука', 'Лайфхак', 'Технологии', 'Арт/искусство'], + 'repost_count': int, + 'uploader': 'YAPPY', + } + }, { + 'url': 'https://yappy.media/video/3862451954ad4bd58ae2ccefddb0bd33', + 'info_dict': { + 'id': '3862451954ad4bd58ae2ccefddb0bd33', + 'ext': 'mp4', + 'title': 'Опиши свой характер 3 словами🙃\n#психология #дружба #отношения', + 'timestamp': 1674726985, + 'like_count': int, + 'description': 'Опиши свой характер 3 словами🙃\n#психология #дружба #отношения', + 'uploader_id': '6793ee3581974a3586fc01e157de6c99', + 'view_count': int, + 'repost_count': int, + 'uploader': 'LENA SHTURMAN', + 'upload_date': '20230126', + 'thumbnail': 'https://cdn-st.ritm.media/static/pic/user_thumbnails/6e76bb4bbad640b6/9ec84c115b2b1967/1674716171.jpg', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_ld = self._search_json_ld(webpage, video_id) + nextjs_data = self._search_nextjs_data(webpage, video_id) + + media_data = ( + traverse_obj( + nextjs_data, ('props', 'pageProps', ('data', 'OpenGraphParameters')), get_all=False) + or self._download_json(f'https://yappy.media/api/video/{video_id}', video_id)) + + media_url = traverse_obj(media_data, ('link', {url_or_none})) or '' + has_watermark = media_url.endswith('-wm.mp4') + + formats = [{ + 'url': media_url, + 'ext': 'mp4', + 'format_note': 'Watermarked' if has_watermark else None, + 'preference': -10 if has_watermark else None + }] if media_url else [] + + if has_watermark: + formats.append({ + 'url': media_url.replace('-wm.mp4', '.mp4'), + 'ext': 'mp4' + }) + + audio_link = traverse_obj(media_data, ('audio', 'link')) + if audio_link: + formats.append({ + 'url': audio_link, + 'ext': 'mp3', + 'acodec': 'mp3', + 'vcodec': 'none' + }) + + return { + 'id': video_id, + 'title': (json_ld.get('description') or self._html_search_meta(['og:title'], webpage) + or self._html_extract_title(webpage)), + 'formats': formats, + 'thumbnail': (media_data.get('thumbnail') + or self._html_search_meta(['og:image', 'og:image:secure_url'], webpage)), + 'description': (media_data.get('description') or json_ld.get('description') + or self._html_search_meta(['description', 'og:description'], webpage)), + 'timestamp': unified_timestamp(media_data.get('publishedAt') or json_ld.get('timestamp')), + 'view_count': int_or_none(media_data.get('viewsCount') or json_ld.get('view_count')), + 'like_count': int_or_none(media_data.get('likesCount')), + 'uploader': traverse_obj(media_data, ('creator', 'firstName')), + 'uploader_id': traverse_obj(media_data, ('creator', ('uuid', 'nickname')), get_all=False), + 'categories': traverse_obj(media_data, ('categories', ..., 'name')) or None, + 'repost_count': int_or_none(media_data.get('sharingCount')) + }