Add system TTS provider to the extension

2025-06-05 21:59:27 +02:00 · 2023-05-04 23:32:10 +03:00
parent 0d1ce6fd9a
commit d3165ba69b
3 changed files with 151 additions and 3 deletions
--- a/public/scripts/extensions/tts/index.js
+++ b/public/scripts/extensions/tts/index.js
@ -3,6 +3,7 @@ import { extension_settings, getContext } from '../../extensions.js'
 import { getStringHash } from '../../utils.js'
 import { ElevenLabsTtsProvider } from './elevenlabs.js'
 import { SileroTtsProvider } from './silerotts.js'
+import { SystemTtsProvider } from './system.js'

 const UPDATE_INTERVAL = 1000

@ -17,7 +18,8 @@ let lastMessageHash = null

 let ttsProviders = {
    ElevenLabs: ElevenLabsTtsProvider,
-    Silero: SileroTtsProvider
+    Silero: SileroTtsProvider,
+    System: SystemTtsProvider, // speaks through the browser's speechSynthesis API
 }
 let ttsProvider
 let ttsProviderName
@ -112,7 +114,13 @@ async function playAudioData(audioBlob) {

 window['tts_preview'] = function (id) {
    const audio = document.getElementById(id)
-    audio.play()
+    // Preview voice `id`: play its <audio> clip if it has one, else let the provider speak a sample.
+    // Branch on the src attribute; audio.hidden is true for hidden="true" AND hidden="false".
+    if (audio.getAttribute('src')) {
+        audio.play()
+    } else {
+        ttsProvider.previewTtsVoice(id)
+    }
 }

 async function onTtsVoicesClick() {
@ -123,7 +131,7 @@ async function onTtsVoicesClick() {

        for (const voice of voiceIds) {
            popupText += `<div class="voice_preview"><b>${voice.name}</b> <i onclick="tts_preview('${voice.voice_id}')" class="fa-solid fa-play"></i></div>`
-            popupText += `<audio id="${voice.voice_id}" src="${voice.preview_url}"></audio>`
+            popupText += `<audio id="${voice.voice_id}" src="${voice.preview_url}" hidden="${!!voice.preview_url}"></audio>`
        }
    } catch {
        popupText = 'Could not load voices list. Check your API key.'
--- a/public/scripts/extensions/tts/system.js
+++ b/public/scripts/extensions/tts/system.js
@ -0,0 +1,140 @@
+export { SystemTtsProvider }
+
+/**
+ * TTS provider that speaks through the browser's Web Speech API
+ * (`window.speechSynthesis`) using the voices it reports.
+ *
+ * Speech is rendered directly by the browser rather than returned as audio data,
+ * so `generateTts` resolves with a silent clip once the utterance has finished.
+ */
+class SystemTtsProvider {
+    //########//
+    // Config //
+    //########//
+
+    // Sample sentences for voice previews, keyed by the voice's `lang` tag.
+    previewStrings = {
+        'en-US': 'The quick brown fox jumps over the lazy dog',
+        'en-GB': 'Sphinx of black quartz, judge my vow',
+        'fr-FR': 'Portez ce vieux whisky au juge blond qui fume',
+        'de-DE': 'Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich',
+        'it-IT': "Pranzo d'acqua fa volti sghembi",
+        'es-ES': 'Quiere la boca exhausta vid, kiwi, piña y fugaz jamón',
+        'es-MX': 'Fabio me exige, sin tapujos, que añada cerveza al whisky',
+        'ru-RU': 'В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!',
+        'pt-BR': 'Vejo xá gritando que fez show sem playback.',
+        'pt-PT': 'Todo pajé vulgar faz boquinha sexy com kiwi.',
+        'uk-UA': "Фабрикуймо гідність, лящім їжею, ґав хапаймо, з'єднавці чаш!",
+    }
+    // Spoken when the voice's language has no entry in previewStrings.
+    fallbackPreview = 'Neque porro quisquam est qui dolorem ipsum quia dolor sit amet'
+    // Active settings; populated by loadSettings().
+    settings
+    voices = []
+
+    // Baseline values. loadSettings() copies this object, so it is never mutated.
+    defaultSettings = {
+        voiceMap: {},
+        rate: 1,
+        pitch: 1,
+    }
+
+    /**
+     * Markup for the provider's settings panel.
+     * @returns {string} Rate and pitch sliders, or a notice when speech synthesis is unavailable.
+     */
+    get settingsHtml() {
+        if (!window.speechSynthesis) {
+            return "Your browser or operating system doesn't support speech synthesis";
+        }
+
+        return `<p>Uses the voices provided by your operating system</p>
+        <label for="system_tts_rate">Rate: <span id="system_tts_rate_output"></span></label>
+        <input id="system_tts_rate" type="range" value="${this.defaultSettings.rate}" min="0.5" max="2" step="0.1" />
+        <label for="system_tts_pitch">Pitch: <span id="system_tts_pitch_output"></span></label>
+        <input id="system_tts_pitch" type="range" value="${this.defaultSettings.pitch}" min="0" max="2" step="0.1" />`;
+    }
+
+    /**
+     * Reads the rate and pitch sliders into the active settings and refreshes their labels.
+     */
+    onSettingsChange() {
+        this.settings.rate = Number($('#system_tts_rate').val());
+        this.settings.pitch = Number($('#system_tts_pitch').val());
+        $('#system_tts_pitch_output').text(this.settings.pitch);
+        $('#system_tts_rate_output').text(this.settings.rate);
+        console.log('Save changes');
+    }
+
+    /**
+     * Applies stored settings on top of the defaults and syncs the sliders and labels.
+     * @param {object} settings Stored settings; only keys present in defaultSettings are accepted.
+     * @throws {string} When `settings` contains a key that is not in defaultSettings.
+     */
+    loadSettings(settings) {
+        // Populate Provider UI given input settings
+        if (Object.keys(settings).length === 0) {
+            console.info("Using default TTS Provider settings");
+        }
+
+        // Start from a copy: assigning defaultSettings itself would let the loop below
+        // (and onSettingsChange) overwrite the defaults.
+        this.settings = { ...this.defaultSettings };
+
+        // Only accept keys defined in defaultSettings
+        for (const key in settings) {
+            if (key in this.settings) {
+                this.settings[key] = settings[key];
+            } else {
+                throw `Invalid setting passed to TTS Provider: ${key}`;
+            }
+        }
+
+        // `??` rather than `||`: a stored pitch of 0 is valid and must not snap back to the default.
+        $('#system_tts_rate').val(this.settings.rate ?? this.defaultSettings.rate);
+        $('#system_tts_pitch').val(this.settings.pitch ?? this.defaultSettings.pitch);
+        $('#system_tts_pitch_output').text(this.settings.pitch);
+        $('#system_tts_rate_output').text(this.settings.rate);
+        console.info("Settings loaded");
+    }
+
+    /**
+     * No-op: this provider has nothing to apply.
+     */
+    async onApplyClick() {
+        return;
+    }
+
+    //#################//
+    //  TTS Interfaces //
+    //#################//
+
+    /**
+     * Lists the voices reported by the browser.
+     * @returns {{name: string, voice_id: string, preview_url: string}[]} One entry per voice;
+     *   `voice_id` is the voice's URI and `preview_url` is always empty (previews are spoken
+     *   by previewTtsVoice). Empty when speech synthesis is unavailable.
+     */
+    fetchTtsVoiceIds() {
+        if (!window.speechSynthesis) {
+            return [];
+        }
+
+        // NOTE(review): some browsers fill the voice list asynchronously, so an early call
+        // may see an empty list - confirm whether callers need to retry.
+        return speechSynthesis.getVoices().map(x => ({ name: x.name, voice_id: x.voiceURI, preview_url: '' }));
+    }
+
+    /**
+     * Speaks a sample sentence in the given voice at neutral rate and pitch,
+     * cancelling anything currently being spoken.
+     * @param {string} voiceId URI of the voice to preview.
+     * @throws {string} When no voice has that URI.
+     */
+    previewTtsVoice(voiceId) {
+        const voice = speechSynthesis.getVoices().find(x => x.voiceURI === voiceId);
+
+        if (!voice) {
+            throw `TTS Voice id ${voiceId} not found`;
+        }
+
+        speechSynthesis.cancel();
+        const text = this.previewStrings[voice.lang] ?? this.fallbackPreview;
+        const utterance = new SpeechSynthesisUtterance(text);
+        utterance.voice = voice;
+        utterance.rate = 1;
+        utterance.pitch = 1;
+        speechSynthesis.speak(utterance);
+    }
+
+    /**
+     * Resolves a voice display name to its id.
+     * @param {string} voiceName Name of the voice, as reported by the browser.
+     * @returns {Promise<{voice_id: (string|null), name?: string}>} The voice's URI and name,
+     *   or `{ voice_id: null }` when speech synthesis is unavailable.
+     * @throws {string} When no voice has that name.
+     */
+    async getVoice(voiceName) {
+        if (!window.speechSynthesis) {
+            return { voice_id: null };
+        }
+
+        const voices = window.speechSynthesis.getVoices();
+        const match = voices.find(x => x.name === voiceName);
+
+        if (!match) {
+            throw `TTS Voice name ${voiceName} not found`;
+        }
+
+        return { voice_id: match.voiceURI, name: match.name };
+    }
+
+    /**
+     * Speaks `text` with the given voice using the configured rate and pitch.
+     * @param {string} text Text to speak.
+     * @param {string} voiceId URI of the voice to use; if no voice matches, the utterance's
+     *   voice is left unset.
+     * @returns {Promise<Response>} The fetched silent clip, resolved once speech has ended.
+     *   Presumably this gives the caller's audio playback step something to play - confirm
+     *   against the caller.
+     * @throws {string} When speech synthesis is unavailable; the promise rejects with a
+     *   message string when the utterance fails.
+     */
+    async generateTts(text, voiceId) {
+        if (!window.speechSynthesis) {
+            throw 'Speech synthesis API is not supported';
+        }
+
+        const silence = await fetch('/sounds/silence.mp3');
+
+        return new Promise((resolve, reject) => {
+            const voices = speechSynthesis.getVoices();
+            const voice = voices.find(x => x.voiceURI === voiceId);
+            const utterance = new SpeechSynthesisUtterance(text);
+            utterance.voice = voice;
+            // `??` rather than `||`: a pitch of 0 is a valid slider value.
+            utterance.rate = this.settings.rate ?? 1;
+            utterance.pitch = this.settings.pitch ?? 1;
+            utterance.onend = () => resolve(silence);
+            utterance.onerror = (event) => reject(`Speech synthesis failed: ${event?.error}`);
+            speechSynthesis.speak(utterance);
+        });
+    }
+}
--- a/public/sounds/silence.mp3
+++ b/public/sounds/silence.mp3