From 305d60a28e5fc39934b9a63c2dfd626b0dacdc2b Mon Sep 17 00:00:00 2001 From: Risenafis Date: Sun, 9 Jun 2024 12:03:09 +0900 Subject: [PATCH] add SBVits2TtsProvider --- public/scripts/extensions/tts/index.js | 2 + public/scripts/extensions/tts/sbvits2.js | 338 +++++++++++++++++++++++ 2 files changed, 340 insertions(+) create mode 100644 public/scripts/extensions/tts/sbvits2.js diff --git a/public/scripts/extensions/tts/index.js b/public/scripts/extensions/tts/index.js index 1ac1edd8b..c54458efe 100644 --- a/public/scripts/extensions/tts/index.js +++ b/public/scripts/extensions/tts/index.js @@ -11,6 +11,7 @@ import { power_user } from '../../power-user.js'; import { OpenAITtsProvider } from './openai.js'; import { XTTSTtsProvider } from './xtts.js'; import { GSVITtsProvider } from './gsvi.js'; +import { SBVits2TtsProvider } from './sbvits2.js'; import { AllTalkTtsProvider } from './alltalk.js'; import { SpeechT5TtsProvider } from './speecht5.js'; import { AzureTtsProvider } from './azure.js'; @@ -77,6 +78,7 @@ const ttsProviders = { Silero: SileroTtsProvider, XTTSv2: XTTSTtsProvider, GSVI: GSVITtsProvider, + SBVits2: SBVits2TtsProvider, System: SystemTtsProvider, Coqui: CoquiTtsProvider, Edge: EdgeTtsProvider, diff --git a/public/scripts/extensions/tts/sbvits2.js b/public/scripts/extensions/tts/sbvits2.js new file mode 100644 index 000000000..49d0ef881 --- /dev/null +++ b/public/scripts/extensions/tts/sbvits2.js @@ -0,0 +1,338 @@ +import { getPreviewString, saveTtsProviderSettings } from './index.js'; + +export { SBVits2TtsProvider }; + +class SBVits2TtsProvider { + //########// + // Config // + //########// + + settings; + ready = false; + voices = []; + separator = '. '; + audioElement = document.createElement('audio'); + + /** + * Perform any text processing before passing to TTS engine. 
+     * @param {string} text Input text
+     * @returns {string} Processed text
+     */
+    processText(text) {
+        return text;
+    }
+
+    // Display name -> language key. The values match the keys of `langKey2LangCode`
+    // and the default of `settings.language`.
+    // NOTE(review): not referenced by the code visible here; presumably consumed by the settings markup.
+    languageLabels = {
+        'Chinese': 'ZH',
+        'English': 'EN',
+        'Japanese': 'JP',
+    };
+
+    // Language key -> locale code handed to getPreviewString() when previewing a voice.
+    langKey2LangCode = {
+        'ZH': 'zh-CN',
+        'EN': 'en-US',
+        'JP': 'ja-JP',
+    };
+
+    // Baseline values; loadSettings() copies these and overlays the saved settings.
+    defaultSettings = {
+        provider_endpoint: 'http://localhost:5000',
+        sdp_ratio: 0.2,
+        noise: 0.6,
+        noisew: 0.8,
+        length: 1,
+        language: 'JP',
+        auto_split: true,
+        split_interval: 0.5,
+        assist_text: '',
+        assist_text_weight: 1,
+        style: 'Neutral',
+        style_weight: 1,
+        reference_audio_path: '',
+    };
+
+    /**
+     * Markup for the provider settings panel.
+     * NOTE(review): the markup inside this template appears to have been stripped from this copy
+     * of the patch; the `#sbvits_*` elements used by onSettingsChange()/loadSettings() are
+     * presumably defined here — restore from the original commit.
+     * @returns {string} HTML string
+     */
+    get settingsHtml() {
+        const html = `
+
+
+        SBVits2 Settings:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+        `;
+
+        return html;
+    }
+
+    /**
+     * Copy the current settings values into their `#sbvits_<key>_output` display elements.
+     */
+    updateSettingOutputs() {
+        const keys = ['sdp_ratio', 'noise', 'noisew', 'length', 'split_interval', 'assist_text_weight', 'style_weight'];
+        for (const key of keys) {
+            $(`#sbvits_${key}_output`).text(this.settings[key]);
+        }
+    }
+
+    /**
+     * Read every settings control back into `this.settings`, refresh the value
+     * displays and persist the result. Used when provider settings are updated from the UI.
+     */
+    onSettingsChange() {
+        this.settings.provider_endpoint = $('#sbvits_tts_endpoint').val();
+        this.settings.language = $('#sbvits_api_language').val();
+        this.settings.assist_text = $('#sbvits_assist_text').val();
+        this.settings.reference_audio_path = $('#sbvits_reference_audio_path').val();
+
+        // Generation parameters
+        this.settings.sdp_ratio = $('#sbvits_sdp_ratio').val();
+        this.settings.noise = $('#sbvits_noise').val();
+        this.settings.noisew = $('#sbvits_noisew').val();
+        this.settings.length = $('#sbvits_length').val();
+        this.settings.auto_split = $('#sbvits_auto_split').is(':checked');
+        this.settings.split_interval = $('#sbvits_split_interval').val();
+        this.settings.assist_text_weight = $('#sbvits_assist_text_weight').val();
+        this.settings.style_weight = $('#sbvits_style_weight').val();
+
+        // Update the UI to reflect changes
+        this.updateSettingOutputs();
+
+        saveTtsProviderSettings();
+        this.changeTTSSettings();
+    }
+
+    /**
+     * Populate the provider UI from saved settings, register the UI listeners
+     * and run the readiness check.
+     * @param {object} settings Saved provider settings; keys not present in `defaultSettings` are ignored
+     * @returns {Promise<void>}
+     */
+    async loadSettings(settings) {
+        if (Object.keys(settings).length === 0) {
+            console.info('Using default TTS Provider settings');
+        }
+
+        // Work on a copy so that saved values never overwrite `defaultSettings` itself.
+        this.settings = { ...this.defaultSettings };
+
+        // Only accept keys defined in defaultSettings
+        for (const key in settings) {
+            if (Object.hasOwn(this.settings, key)) {
+                this.settings[key] = settings[key];
+            } else {
+                console.debug(`Ignoring non-user-configurable setting: ${key}`);
+            }
+        }
+
+        // Set initial values from the settings
+        $('#sbvits_tts_endpoint').val(this.settings.provider_endpoint);
+        $('#sbvits_api_language').val(this.settings.language);
+        $('#sbvits_assist_text').val(this.settings.assist_text);
+        $('#sbvits_reference_audio_path').val(this.settings.reference_audio_path);
+        $('#sbvits_sdp_ratio').val(this.settings.sdp_ratio);
+        $('#sbvits_noise').val(this.settings.noise);
+        $('#sbvits_noisew').val(this.settings.noisew);
+        $('#sbvits_length').val(this.settings.length);
+        $('#sbvits_auto_split').prop('checked', this.settings.auto_split);
+        $('#sbvits_split_interval').val(this.settings.split_interval);
+        $('#sbvits_assist_text_weight').val(this.settings.assist_text_weight);
+        $('#sbvits_style_weight').val(this.settings.style_weight);
+
+        // Update the UI to reflect changes
+        this.updateSettingOutputs();
+
+        // Register input/change event listeners to update settings on user interaction
+        const inputSelectors = [
+            '#sbvits_tts_endpoint',
+            '#sbvits_assist_text',
+            '#sbvits_reference_audio_path',
+        ];
+        const changeSelectors = [
+            '#sbvits_api_language',
+            '#sbvits_sdp_ratio',
+            '#sbvits_noise',
+            '#sbvits_noisew',
+            '#sbvits_length',
+            '#sbvits_auto_split',
+            '#sbvits_split_interval',
+            '#sbvits_assist_text_weight',
+            '#sbvits_style_weight',
+        ];
+        $(inputSelectors.join(', ')).on('input', () => { this.onSettingsChange(); });
+        $(changeSelectors.join(', ')).on('change', () => { this.onSettingsChange(); });
+
+        await this.checkReady();
+
+        console.info('SBVits2: Settings loaded');
+    }
+
+    /**
+     * Perform a simple readiness check by trying to fetch the voice list.
+     * Promise.allSettled never rejects, so a failed fetch is not propagated to the caller.
+     * @returns {Promise<void>}
+     */
+    async checkReady() {
+        await Promise.allSettled([this.fetchTtsVoiceObjects(), this.changeTTSSettings()]);
+    }
+
+    // No-op for this provider.
+    async onRefreshClick() {
+        return;
+    }
+
+    //#################//
+    //  TTS Interfaces //
+    //#################//
+
+    /**
+     * Get a voice from the TTS provider.
+     * The voice list is fetched first when it has not been loaded yet.
+     * @param {string} voiceName Voice name to get
+     * @returns {Promise} Voice object
+     * @throws {string} When no voice with that name exists
+     */
+    async getVoice(voiceName) {
+        if (this.voices.length === 0) {
+            this.voices = await this.fetchTtsVoiceObjects();
+        }
+        const match = this.voices.find(v => v.name === voiceName);
+        if (!match) {
+            // Thrown as a plain string, unchanged from the original, so existing callers see the same value.
+            throw `TTS Voice name ${voiceName} not found`;
+        }
+        return match;
+    }
+
+    /**
+     * Generate speech for the given text.
+     * @param {string} text Text to synthesize
+     * @param {string} voiceId Voice ID (model_id-speaker_id-style)
+     * @returns {Promise} Fetch response returned by fetchTtsGeneration()
+     */
+    async generateTts(text, voiceId) {
+        const response = await this.fetchTtsGeneration(text, voiceId);
+        return response;
+    }
+
+    //###########//
+    // API CALLS //
+    //###########//
+
+    /**
+     * Fetch `/models/info` from the endpoint and build one voice entry per
+     * (model, speaker, style) combination.
+     * @returns {Promise<Array>} Voice objects with `name`, `voice_id` and `preview_url`
+     * @throws {Error} When the endpoint answers with a non-OK status
+     */
+    async fetchTtsVoiceObjects() {
+        const response = await fetch(`${this.settings.provider_endpoint}/models/info`);
+        if (!response.ok) {
+            // Read the body as text: an error body is not guaranteed to be JSON, and a parsed
+            // object would be rendered as "[object Object]" in the message.
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+        }
+        const data = await response.json();
+        const voices = Object.entries(data).flatMap(([key, config]) => {
+            const styles = Object.keys(config.style2id);
+            return Object.entries(config.spk2id).flatMap(([speaker, speaker_id]) =>
+                styles.map(style => ({
+                    name: `${speaker} (${style})`,
+                    voice_id: `${key}-${speaker_id}-${style}`,
+                    preview_url: false,
+                })),
+            );
+        });
+
+        this.voices = voices; // Assign to the class property
+        return voices; // Also return this list
+    }
+
+    // Called each time a parameter is changed; currently does nothing.
+    async changeTTSSettings() {
+    }
+
+    /**
+     * Fetch TTS generation from the API.
+     * @param {string} inputText Text to generate TTS for
+     * @param {string} voiceId Voice ID to use (model_id-speaker_id-style)
+     * @returns {Promise} Fetch response
+     * @throws {Error} When the endpoint answers with a non-OK status
+     */
+    async fetchTtsGeneration(inputText, voiceId) {
+        console.info(`Generating new TTS for voice_id ${voiceId}`);
+
+        // Only the first two hyphens are separators; a style name may itself contain '-',
+        // so everything after them is rejoined into the style.
+        const [model_id, speaker_id, ...styleParts] = voiceId.split('-');
+        const style = styleParts.join('-');
+
+        const params = new URLSearchParams();
+        params.append('text', inputText);
+        params.append('model_id', model_id);
+        params.append('speaker_id', speaker_id);
+        params.append('sdp_ratio', this.settings.sdp_ratio);
+        params.append('noise', this.settings.noise);
+        params.append('noisew', this.settings.noisew);
+        params.append('length', this.settings.length);
+        params.append('language', this.settings.language);
+        params.append('auto_split', this.settings.auto_split);
+        params.append('split_interval', this.settings.split_interval);
+        // The assist-text parameters are only sent when an assist text is set.
+        if (this.settings.assist_text) {
+            params.append('assist_text', this.settings.assist_text);
+            params.append('assist_text_weight', this.settings.assist_text_weight);
+        }
+        params.append('style', style);
+        params.append('style_weight', this.settings.style_weight);
+        if (this.settings.reference_audio_path) {
+            params.append('reference_audio_path', this.settings.reference_audio_path);
+        }
+        const url = `${this.settings.provider_endpoint}/voice?${params.toString()}`;
+
+        // All parameters travel in the query string; the POST carries no body.
+        const response = await fetch(url, { method: 'POST' });
+        if (!response.ok) {
+            toastr.error(response.statusText, 'TTS Generation Failed');
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+        }
+        return response;
+    }
+
+    /**
+     * Preview TTS for a given voice ID.
+     * @param {string} id Voice ID
+     * @returns {Promise<void>}
+     * @throws {Error} When generation fails
+     */
+    async previewTtsVoice(id) {
+        // Stop and rewind whatever is currently loaded in the shared audio element.
+        this.audioElement.pause();
+        this.audioElement.currentTime = 0;
+
+        // The language key lives under `settings.language`; reading `settings.lang` always gave
+        // undefined, so every preview fell back to Japanese. Unknown keys still fall back to 'ja-JP'.
+        const lang_code = this.langKey2LangCode[this.settings.language] ?? 'ja-JP';
+        const text = getPreviewString(lang_code);
+        const response = await this.fetchTtsGeneration(text, id);
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+        }
+
+        // Play the returned audio through an object URL.
+        const audio = await response.blob();
+        const url = URL.createObjectURL(audio);
+        this.audioElement.src = url;
+        this.audioElement.play();
+    }
+
+    /**
+     * Interface not used by this provider; resolves with the id it was given.
+     * @param {*} history_item_id History item ID
+     * @returns {Promise} Promise resolving to `history_item_id`
+     */
+    async fetchTtsFromHistory(history_item_id) {
+        return Promise.resolve(history_item_id);
+    }
+}