[extractor/niconico:live] Add extractor (#5764)

Authored by: Lesmiscore
2023-05-29 18:35:10 +09:00 · 2023-05-29 18:35:10 +09:00 · f8f9250fe2
parent 3459d3c5af
commit f8f9250fe2
4 changed files with 266 additions and 2 deletions
--- a/yt_dlp/downloader/__init__.py
+++ b/yt_dlp/downloader/__init__.py
@ -30,7 +30,7 @@ from .hls import HlsFD
 from .http import HttpFD
 from .ism import IsmFD
 from .mhtml import MhtmlFD
-from .niconico import NiconicoDmcFD
+from .niconico import NiconicoDmcFD, NiconicoLiveFD
 from .rtmp import RtmpFD
 from .rtsp import RtspFD
 from .websocket import WebSocketFragmentFD
@ -50,6 +50,7 @@ PROTOCOL_MAP = {
    'ism': IsmFD,
    'mhtml': MhtmlFD,
    'niconico_dmc': NiconicoDmcFD,
    'niconico_live': NiconicoLiveFD,
    'fc2_live': FC2LiveFD,
    'websocket_frag': WebSocketFragmentFD,
    'youtube_live_chat': YoutubeLiveChatFD,
--- a/yt_dlp/downloader/niconico.py
+++ b/yt_dlp/downloader/niconico.py
@ -1,8 +1,17 @@
 import json
 import threading
 import time
 from . import get_suitable_downloader
 from .common import FileDownloader
-from ..utils import sanitized_Request
+from .external import FFmpegFD
 from ..utils import (
    DownloadError,
    str_or_none,
    sanitized_Request,
    WebSocketsWrapper,
    try_get,
 )
 class NiconicoDmcFD(FileDownloader):
@ -50,3 +59,93 @@ class NiconicoDmcFD(FileDownloader):
                    timer[0].cancel()
                    download_complete = True
        return success
 class NiconicoLiveFD(FileDownloader):
    """Download a niconico live stream without being stopped by the server.

    The media is fetched by FFmpegFD as an HLS ('m3u8') download, while a
    daemon thread keeps talking to the watch WebSocket (answering pings and
    reconnecting after failures) for the duration of the download.
    """
    def real_download(self, filename, info_dict):
        """Start the WebSocket keep-alive thread and delegate to FFmpegFD.

        Keys read from info_dict:
            video_id, url, ws, origin  -- required
            cookies, live_quality, live_latency  -- optional

        Returns the result of FFmpegFD.download().
        """
        video_id = info_dict['video_id']
        # NOTE(review): 'url' is used here as the WebSocket URL for reconnects,
        # and the same 'url' is handed to FFmpegFD through the copied info dict
        # below -- confirm the caller really intends one value for both.
        ws_url = info_dict['url']
        # WebSocket object handed over in the info dict; used for the first session
        ws_extractor = info_dict['ws']
        ws_origin_host = info_dict['origin']
        cookies = info_dict.get('cookies')
        live_quality = info_dict.get('live_quality', 'high')
        live_latency = info_dict.get('live_latency', 'high')
        dl = FFmpegFD(self.ydl, self.params or {})
        # FFmpegFD gets a copy of the info dict with only the protocol replaced
        new_info_dict = info_dict.copy()
        new_info_dict.update({
            'protocol': 'm3u8',
        })
        def communicate_ws(reconnect):
            """Serve one WebSocket session.

            Returns True when the server sends a 'disconnect' message.
            Raises DownloadError when the server sends an 'error' message.
            """
            if reconnect:
                ws = WebSocketsWrapper(ws_url, {
                    # NOTE(review): the standard HTTP request header is 'Cookie';
                    # confirm that 'Cookies' is what the server expects.
                    'Cookies': str_or_none(cookies) or '',
                    'Origin': f'https://{ws_origin_host}',
                    'Accept': '*/*',
                    'User-Agent': self.params['http_headers']['User-Agent'],
                })
                if self.ydl.params.get('verbose', False):
                    self.to_screen('[debug] Sending startWatching request')
                ws.send(json.dumps({
                    'type': 'startWatching',
                    'data': {
                        'stream': {
                            'quality': live_quality,
                            'protocol': 'hls+fmp4',
                            'latency': live_latency,
                            'chasePlay': False
                        },
                        'room': {
                            'protocol': 'webSocket',
                            'commentable': True
                        },
                        'reconnect': True,
                    }
                }))
            else:
                # First session: reuse the socket passed in via info_dict['ws']
                ws = ws_extractor
            with ws:
                while True:
                    recv = ws.recv()
                    if not recv:
                        continue
                    data = json.loads(recv)
                    if not data or not isinstance(data, dict):
                        continue
                    if data.get('type') == 'ping':
                        # pong back
                        ws.send(r'{"type":"pong"}')
                        ws.send(r'{"type":"keepSeat"}')
                    elif data.get('type') == 'disconnect':
                        self.write_debug(data)
                        return True
                    elif data.get('type') == 'error':
                        self.write_debug(data)
                        message = try_get(data, lambda x: x['body']['code'], str) or recv
                        # Raise (not return) so ws_main() reports the error and
                        # backs off before reconnecting; a returned exception
                        # object was silently ignored by the caller.
                        raise DownloadError(message)
                    elif self.ydl.params.get('verbose', False):
                        if len(recv) > 100:
                            recv = recv[:100] + '...'
                        self.to_screen('[debug] Server said: %s' % recv)
        def ws_main():
            """Run WebSocket sessions until the server disconnects us."""
            reconnect = False
            while True:
                try:
                    if communicate_ws(reconnect) is True:
                        return
                except BaseException as e:
                    # Broad on purpose: any failure of a session leads to a
                    # delayed reconnect instead of ending the keep-alive thread.
                    self.to_screen(
                        f'[niconico:live] {video_id}: Connection error occurred, '
                        f'reconnecting after 10 seconds: {str_or_none(e)}')
                    time.sleep(10)
                finally:
                    # Every session after the first must open a new socket
                    reconnect = True
        thread = threading.Thread(target=ws_main, daemon=True)
        thread.start()
        return dl.download(filename, new_info_dict)
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@ -1275,6 +1275,7 @@ from .niconico import (
    NicovideoSearchIE,
    NicovideoSearchURLIE,
    NicovideoTagURLIE,
    NiconicoLiveIE,
 )
 from .ninecninemedia import (
    NineCNineMediaIE,
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@ -5,13 +5,17 @@ import json
 import re
 import time
 from urllib.parse import urlparse
 from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
    compat_HTTPError,
 )
 from ..dependencies import websockets
 from ..utils import (
    ExtractorError,
    OnDemandPagedList,
    WebSocketsWrapper,
    bug_reports_message,
    clean_html,
    float_or_none,
@ -895,3 +899,162 @@ class NiconicoUserIE(InfoExtractor):
    def _real_extract(self, url):
        """Build a playlist result from the entries of the ID matched in *url*."""
        playlist_id = self._match_id(url)
        entries = self._entries(playlist_id)
        return self.playlist_result(entries, playlist_id, ie=NiconicoIE.ie_key())
 class NiconicoLiveIE(InfoExtractor):
    # Extractor for niconico live broadcasts. It opens the watch WebSocket
    # advertised in the page's embedded data, requests an HLS stream over it and
    # passes the still-open socket on to the 'niconico_live' downloader protocol.
    IE_NAME = 'niconico:live'
    IE_DESC = 'ニコニコ生放送'
    _VALID_URL = r'https?://(?:sp\.)?live2?\.nicovideo\.jp/(?:watch|gate)/(?P<id>lv\d+)'
    _TESTS = [{
        'note': 'this test case includes invisible characters for title, pasting them as-is',
        'url': 'https://live.nicovideo.jp/watch/lv339533123',
        'info_dict': {
            'id': 'lv339533123',
            'title': '激辛ペヤング食べます‪( ;ᯅ; )‬（歌枠オーディション参加中）',
            'view_count': 1526,
            'comment_count': 1772,
            'description': '初めましてもかって言います❕\nのんびり自由に適当に暮らしてます',
            'uploader': 'もか',
            'channel': 'ゲストさんのコミュニティ',
            'channel_id': 'co5776900',
            'channel_url': 'https://com.nicovideo.jp/community/co5776900',
            'timestamp': 1670677328,
            'is_live': True,
        },
        'skip': 'livestream',
    }, {
        'url': 'https://live2.nicovideo.jp/watch/lv339533123',
        'only_matching': True,
    }, {
        'url': 'https://sp.live.nicovideo.jp/watch/lv339533123',
        'only_matching': True,
    }, {
        'url': 'https://sp.live2.nicovideo.jp/watch/lv339533123',
        'only_matching': True,
    }]
    # Accepted values for the 'latency' extractor argument; anything else
    # falls back to 'high' in _real_extract().
    _KNOWN_LATENCY = ('high', 'low')
    def _real_extract(self, url):
        """Extract metadata and formats for a live broadcast.

        Raises ExtractorError when the websockets library is missing, when the
        page carries no WebSocket URL, or when the server answers the stream
        request with a 'disconnect' or 'error' message.
        """
        if not websockets:
            raise ExtractorError('websockets library is not available. Please install it.', expected=True)
        video_id = self._match_id(url)
        webpage, urlh = self._download_webpage_handle(f'https://live.nicovideo.jp/watch/{video_id}', video_id)
        embedded_data = self._parse_json(unescapeHTML(self._search_regex(
            r'<script\s+id="embedded-data"\s*data-props="(.+?)"', webpage, 'embedded data')), video_id)
        ws_url = traverse_obj(embedded_data, ('site', 'relive', 'webSocketUrl'))
        if not ws_url:
            raise ExtractorError('The live hasn\'t started yet or already ended.', expected=True)
        ws_url = update_url_query(ws_url, {
            'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9',
        })
        # Origin host is taken from the final page URL, minus a leading 'sp.'
        hostname = remove_start(urlparse(urlh.geturl()).hostname, 'sp.')
        cookies = try_get(urlh.geturl(), self._downloader._calc_cookies)
        latency = try_get(self._configuration_arg('latency'), lambda x: x[0])
        if latency not in self._KNOWN_LATENCY:
            latency = 'high'
        ws = WebSocketsWrapper(ws_url, {
            # NOTE(review): the standard HTTP request header is 'Cookie';
            # confirm that 'Cookies' is what the server expects.
            'Cookies': str_or_none(cookies) or '',
            'Origin': f'https://{hostname}',
            'Accept': '*/*',
            'User-Agent': self.get_param('http_headers')['User-Agent'],
        })
        # No literal '[debug] ' here: write_debug() is expected to add that
        # prefix itself, as the other write_debug() call below already assumes.
        self.write_debug('Sending HLS server request')
        ws.send(json.dumps({
            'type': 'startWatching',
            'data': {
                'stream': {
                    'quality': 'abr',
                    'protocol': 'hls+fmp4',
                    'latency': latency,
                    'chasePlay': False
                },
                'room': {
                    'protocol': 'webSocket',
                    'commentable': True
                },
                'reconnect': False,
            }
        }))
        # Read messages until the server announces the stream
        while True:
            recv = ws.recv()
            if not recv:
                continue
            data = json.loads(recv)
            if not isinstance(data, dict):
                continue
            if data.get('type') == 'stream':
                m3u8_url = data['data']['uri']
                qualities = data['data']['availableQualities']
                break
            elif data.get('type') == 'disconnect':
                self.write_debug(recv)
                raise ExtractorError('Disconnected at middle of extraction')
            elif data.get('type') == 'error':
                self.write_debug(recv)
                message = traverse_obj(data, ('body', 'code')) or recv
                raise ExtractorError(message)
            elif self.get_param('verbose', False):
                if len(recv) > 100:
                    recv = recv[:100] + '...'
                self.write_debug('Server said: %s' % recv)
        title = traverse_obj(embedded_data, ('program', 'title')) or self._html_search_meta(
            ('og:title', 'twitter:title'), webpage, 'live title', fatal=False)
        raw_thumbs = traverse_obj(embedded_data, ('program', 'thumbnail')) or {}
        thumbnails = []
        for name, value in raw_thumbs.items():
            if not isinstance(value, dict):
                # Non-dict value: used directly as the thumbnail URL
                thumbnails.append({
                    'id': name,
                    'url': value,
                    **parse_resolution(value, lenient=True),
                })
                continue
            # Dict value: one thumbnail per item; the resolution is parsed from
            # the key first, then from the URL
            for k, img_url in value.items():
                res = parse_resolution(k, lenient=True) or parse_resolution(img_url, lenient=True)
                width, height = res.get('width'), res.get('height')
                thumbnails.append({
                    'id': f'{name}_{width}x{height}',
                    'url': img_url,
                    **res,
                })
        formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', live=True)
        # Pair formats with the reversed quality names, skipping the first name.
        # NOTE(review): presumably the first entry is 'abr' and the rest are
        # ordered best-first -- confirm against the server's message.
        for fmt, q in zip(formats, reversed(qualities[1:])):
            fmt.update({
                'format_id': q,
                'protocol': 'niconico_live',
                # The extra keys below travel with the format for the downloader
                'ws': ws,
                'video_id': video_id,
                'cookies': cookies,
                'live_latency': latency,
                'origin': hostname,
            })
        return {
            'id': video_id,
            'title': title,
            **traverse_obj(embedded_data, {
                'view_count': ('program', 'statistics', 'watchCount'),
                'comment_count': ('program', 'statistics', 'commentCount'),
                'uploader': ('program', 'supplier', 'name'),
                'channel': ('socialGroup', 'name'),
                'channel_id': ('socialGroup', 'id'),
                'channel_url': ('socialGroup', 'socialGroupPageUrl'),
            }),
            'description': clean_html(traverse_obj(embedded_data, ('program', 'description'))),
            'timestamp': int_or_none(traverse_obj(embedded_data, ('program', 'openTime'))),
            'is_live': True,
            'thumbnails': thumbnails,
            'formats': formats,
        }