Add support for XTTS streaming

2025-06-05 21:59:27 +02:00 · 2024-01-02 07:04:32 +02:00
parent 9b24e7dc67
commit 99244a0c11
2 changed files with 34 additions and 12 deletions
--- a/public/scripts/extensions/tts/index.js
+++ b/public/scripts/extensions/tts/index.js
@@ -1,6 +1,6 @@
 import { callPopup, cancelTtsPlay, eventSource, event_types, name2, saveSettingsDebounced } from '../../../script.js';
 import { ModuleWorkerWrapper, doExtrasFetch, extension_settings, getApiUrl, getContext, modules } from '../../extensions.js';
-import { delay, escapeRegex, getStringHash, onlyUnique } from '../../utils.js';
+import { delay, escapeRegex, getBase64Async, getStringHash, onlyUnique } from '../../utils.js';
 import { EdgeTtsProvider } from './edge.js';
 import { ElevenLabsTtsProvider } from './elevenlabs.js';
 import { SileroTtsProvider } from './silerotts.js';
@@ -316,12 +316,14 @@ async function playAudioData(audioBlob) {
    if (currentAudioJob == null) {
        console.log('Cancelled TTS playback because currentAudioJob was null');
    }
-    const reader = new FileReader();
+    if (audioBlob instanceof Blob) {
-    reader.onload = function (e) {
+        const srcUrl = await getBase64Async(audioBlob);
-        const srcUrl = e.target.result;
        audioElement.src = srcUrl;
-    };
+    } else if (typeof audioBlob === 'string') {
-    reader.readAsDataURL(audioBlob);
+        audioElement.src = audioBlob;
+    } else {
+        throw `TTS received invalid audio data type ${typeof audioBlob}`;
+    }
    audioElement.addEventListener('ended', completeCurrentAudioJob);
    audioElement.addEventListener('canplay', () => {
        console.debug('Starting TTS playback');
@@ -417,11 +419,15 @@ function completeCurrentAudioJob() {
 * @param {Response} response
 */
 async function addAudioJob(response) {
-    const audioData = await response.blob();
+    if (typeof response === 'string') {
-    if (!audioData.type.startsWith('audio/')) {
+        audioJobQueue.push(response);
-        throw `TTS received HTTP response with invalid data format. Expecting audio/*, got ${audioData.type}`;
+    } else {
+        const audioData = await response.blob();
+        if (!audioData.type.startsWith('audio/')) {
+            throw `TTS received HTTP response with invalid data format. Expecting audio/*, got ${audioData.type}`;
+        }
+        audioJobQueue.push(audioData);
    }
-    audioJobQueue.push(audioData);
    console.debug('Pushed audio job to queue.');
 }
@@ -432,7 +438,7 @@ async function processAudioJobQueue() {
    }
    try {
        audioQueueProcessorReady = false;
-        currentAudioJob = audioJobQueue.pop();
+        currentAudioJob = audioJobQueue.shift();
        playAudioData(currentAudioJob);
        talkingAnimation(true);
    } catch (error) {
--- a/public/scripts/extensions/tts/xtts.js
+++ b/public/scripts/extensions/tts/xtts.js
@@ -52,6 +52,7 @@ class XTTSTtsProvider {
        provider_endpoint: 'http://localhost:8020',
        language: 'en',
        voiceMap: {},
+        streaming: false,
    };
    get settingsHtml() {
@@ -75,7 +76,10 @@ class XTTSTtsProvider {
        </select>
        <label for="xtts_tts_endpoint">Provider Endpoint:</label>
        <input id="xtts_tts_endpoint" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.provider_endpoint}"/>
-
+        <label for="xtts_tts_streaming" class="checkbox_label">
+            <input id="xtts_tts_streaming" type="checkbox" />
+            <span>Streaming <small>(RVC not supported)</small></span>
+        </label>
        `;
        html += `
@@ -90,6 +94,7 @@ class XTTSTtsProvider {
        // Used when provider settings are updated from UI
        this.settings.provider_endpoint = $('#xtts_tts_endpoint').val();
        this.settings.language = $('#xtts_api_language').val();
+        this.settings.streaming = $('#xtts_tts_streaming').is(':checked');
        saveTtsProviderSettings();
    }
@@ -125,6 +130,8 @@ class XTTSTtsProvider {
        $('#xtts_tts_endpoint').on('input', () => { this.onSettingsChange(); });
        $('#xtts_api_language').val(this.settings.language);
        $('#xtts_api_language').on('change', () => { this.onSettingsChange(); });
+        $('#xtts_tts_streaming').prop('checked', this.settings.streaming);
+        $('#xtts_tts_streaming').on('change', () => { this.onSettingsChange(); });
        await this.checkReady();
@@ -176,6 +183,15 @@ class XTTSTtsProvider {
    async fetchTtsGeneration(inputText, voiceId) {
        console.info(`Generating new TTS for voice_id ${voiceId}`);
+        if (this.settings.streaming) {
+            const params = new URLSearchParams();
+            params.append('text', inputText);
+            params.append('speaker_wav', voiceId);
+            params.append('language', this.settings.language);
+            return `${this.settings.provider_endpoint}/tts_stream/?${params.toString()}`;
+        }
        const response = await doExtrasFetch(
            `${this.settings.provider_endpoint}/tts_to_audio/`,
            {