SillyTavern/public/scripts/extensions/tts/system.js

import { isMobile } from '../../RossAscends-mods.js';
import { getPreviewString } from './index.js';
import { saveTtsProviderSettings } from './index.js';
export { SystemTtsProvider };

/**
 * Chunkify
 * Google Chrome Speech Synthesis Chunking Pattern
 * Fixes inconsistencies with speaking long texts in speechUtterance objects
 * Licensed under the MIT License
 *
 * Peter Woolley and Brett Zamir
 * Modified by Haaris for bug fixes
 */

/**
 * Speaks an utterance through `speechSynthesis`, splitting long text into
 * several shorter utterances that are spoken one after another.
 *
 * Behaviour:
 * - If `utt.voice.voiceURI` is `'native'`, the utterance is spoken in one piece
 *   (starting at `settings.offset` if given) and `callback` runs on its 'end' event.
 * - Otherwise the remaining text is cut into pieces of at most `chunkLength`
 *   characters, preferring to end a piece on `.`, `!`, `?`, `,` or a space.
 *   When no such break point exists within range (e.g. long text without
 *   spaces or ASCII punctuation) the piece is cut at `chunkLength` so the
 *   rest of the text is still spoken instead of being dropped.
 *   Each piece is spoken as a new utterance; its 'end' event advances
 *   `settings.offset` and queues the next piece.
 * - A remaining piece of two characters or fewer is treated as end of text
 *   and `callback` is invoked.
 *
 * Setting `speechUtteranceChunker.cancel = true` stops the chain when the
 * current chunk ends (the flag is reset; `callback` is not invoked in the
 * chunked path, but is still invoked in the 'native' path).
 *
 * @param {SpeechSynthesisUtterance} utt Source utterance (text, voice, rate, pitch, ...).
 * @param {object} [settings] Options; this object is mutated (`offset` is advanced).
 * @param {number} [settings.chunkLength=160] Maximum characters per chunk.
 * @param {number} [settings.offset] Index in `utt.text` to start speaking from.
 * @param {function(SpeechSynthesisUtterance): void} [settings.modifier] Called with each utterance just before it is queued.
 * @param {function(): void} [callback] Invoked once all text has been spoken.
 */
function speechUtteranceChunker(utt, settings, callback) {
    settings = settings || {};
    const txt = settings.offset !== undefined ? utt.text.substring(settings.offset) : utt.text;
    let newUtt;

    if (utt.voice && utt.voice.voiceURI === 'native') { // Not part of the spec
        newUtt = utt;
        newUtt.text = txt;
        newUtt.addEventListener('end', function () {
            if (speechUtteranceChunker.cancel) {
                speechUtteranceChunker.cancel = false;
            }
            callback?.();
        });
    } else {
        const chunkLength = settings.chunkLength || 160;
        // Alternatives, in priority order:
        //   1. half..full chunk followed by a punctuation mark,
        //   2. the whole remaining text if it fits in one chunk,
        //   3. up to a full chunk followed by a space.
        const pattRegex = new RegExp(`^[\\s\\S]{${Math.floor(chunkLength / 2)},${chunkLength}}[.!?,]{1}|^[\\s\\S]{1,${chunkLength}}$|^[\\s\\S]{1,${chunkLength}} `);
        const chunkArr = txt.match(pattRegex);

        let chunk;
        if (chunkArr) {
            chunk = chunkArr[0];
        } else {
            // No natural break point in range: hard-cut so the remaining text is not lost.
            chunk = txt.substring(0, chunkLength);
            const lastUnit = chunk.charCodeAt(chunk.length - 1);
            // Avoid ending the chunk in the middle of a surrogate pair.
            if (chunk.length > 1 && lastUnit >= 0xD800 && lastUnit <= 0xDBFF) {
                chunk = chunk.slice(0, -1);
            }
        }

        if (chunk.length <= 2) {
            //call once all text has been spoken...
            callback?.();
            return;
        }

        newUtt = new SpeechSynthesisUtterance(chunk);
        // Carry over any custom (own, enumerable) properties set on the source utterance.
        for (const key in utt) {
            if (Object.hasOwn(utt, key) && key !== 'text') {
                newUtt[key] = utt[key];
            }
        }
        // Standard attributes are copied explicitly.
        newUtt.lang = utt.lang;
        newUtt.voice = utt.voice;
        newUtt.rate = utt.rate;
        newUtt.pitch = utt.pitch;
        newUtt.volume = utt.volume;
        newUtt.addEventListener('end', function () {
            if (speechUtteranceChunker.cancel) {
                speechUtteranceChunker.cancel = false;
                return;
            }
            settings.offset = settings.offset || 0;
            settings.offset += chunk.length;
            speechUtteranceChunker(utt, settings, callback);
        });
    }

    if (settings.modifier) {
        settings.modifier(newUtt);
    }
    console.log(newUtt); //IMPORTANT!! Do not remove: Logging the object out fixes some onend firing issues.
    //placing the speak invocation inside a callback fixes ordering and onend issues.
    setTimeout(function () {
        speechSynthesis.speak(newUtt);
    }, 0);
}

/**
 * TTS provider that speaks through the browser's Web Speech API
 * (`window.speechSynthesis`) using the voices the browser reports.
 *
 * When the browser reports no voices, a placeholder "System Default Voice"
 * entry is offered; selecting it leaves the utterance's voice unset.
 */
class SystemTtsProvider {
    //########//
    // Config //
    //########//

    // Static constants for the simulated default voice
    static BROWSER_DEFAULT_VOICE_ID = '__browser_default__';
    static BROWSER_DEFAULT_VOICE_NAME = 'System Default Voice';

    settings;
    ready = false;
    voices = [];
    separator = ' ... ';

    defaultSettings = {
        voiceMap: {},
        rate: 1,
        pitch: 1,
    };

    /**
     * HTML for the provider's settings panel (rate and pitch sliders), or a
     * plain message when the browser has no speech synthesis support.
     * @returns {string}
     */
    get settingsHtml() {
        if (!('speechSynthesis' in window)) {
            return 'Your browser or operating system doesn\'t support speech synthesis';
        }

        return `<p>Uses the voices provided by your operating system</p>
        <label for="system_tts_rate">Rate: <span id="system_tts_rate_output"></span></label>
        <input id="system_tts_rate" type="range" value="${this.defaultSettings.rate}" min="0.1" max="2" step="0.01" />
        <label for="system_tts_pitch">Pitch: <span id="system_tts_pitch_output"></span></label>
        <input id="system_tts_pitch" type="range" value="${this.defaultSettings.pitch}" min="0" max="2" step="0.01" />`;
    }

    /**
     * Reads the slider values into `this.settings`, refreshes the numeric
     * labels next to the sliders and persists the provider settings.
     */
    onSettingsChange() {
        this.settings.rate = Number($('#system_tts_rate').val());
        this.settings.pitch = Number($('#system_tts_pitch').val());
        $('#system_tts_pitch_output').text(this.settings.pitch);
        $('#system_tts_rate_output').text(this.settings.rate);
        saveTtsProviderSettings();
    }

    /**
     * Applies saved settings on top of a fresh copy of the defaults and syncs the UI.
     * @param {object} settings Saved provider settings; only keys present in `defaultSettings` are accepted.
     * @throws {Error} If `settings` contains a key that is not defined in `defaultSettings`.
     */
    async loadSettings(settings) {
        const incoming = settings ?? {};

        // Populate Provider UI given input settings
        if (Object.keys(incoming).length === 0) {
            console.info('Using default TTS Provider settings');
        }

        // iOS only allows speech synthesis triggered by user interaction, so a
        // silent utterance is spoken on the first click to enable it.
        // Registered only when the API exists; otherwise the handler would throw on every click.
        if (isMobile() && 'speechSynthesis' in window) {
            document.addEventListener('click', () => {
                const utterance = new SpeechSynthesisUtterance(' . ');
                utterance.volume = 0;
                speechSynthesis.speak(utterance);
            }, { once: true });
        }

        // Start from a copy so that loading settings never overwrites the defaults.
        this.settings = {
            ...this.defaultSettings,
            voiceMap: { ...this.defaultSettings.voiceMap },
        };

        // Only accept keys defined in defaultSettings
        for (const [key, value] of Object.entries(incoming)) {
            if (!Object.hasOwn(this.defaultSettings, key)) {
                throw new Error(`Invalid setting passed to TTS Provider: ${key}`);
            }
            this.settings[key] = value;
        }

        $('#system_tts_rate').val(this.settings.rate || this.defaultSettings.rate);
        // Pitch 0 is a valid slider value, so only fall back when it is missing.
        $('#system_tts_pitch').val(this.settings.pitch ?? this.defaultSettings.pitch);

        // Trigger updates
        $('#system_tts_rate').on('input', () => { this.onSettingsChange(); });
        $('#system_tts_pitch').on('input', () => { this.onSettingsChange(); });

        $('#system_tts_pitch_output').text(this.settings.pitch);
        $('#system_tts_rate_output').text(this.settings.rate);
        console.debug('SystemTTS: Settings loaded');
    }

    /**
     * Performs a simple readiness check by trying to fetch the voice list.
     */
    async checkReady() {
        await this.fetchTtsVoiceObjects();
    }

    /**
     * Refresh hook; this provider has nothing to refresh.
     */
    async onRefreshClick() {
        return;
    }

    //#################//
    //  TTS Interfaces //
    //#################//

    /**
     * Lists the available voices, sorted by language and then by name.
     * Resolves to an empty list when speech synthesis is unsupported, and to a
     * single "browser default" placeholder when the browser reports no voices.
     * @returns {Promise<Array<{name: string, voice_id: string, preview_url: boolean, lang: string}>>}
     */
    fetchTtsVoiceObjects() {
        if (!('speechSynthesis' in window)) {
            // Browser doesn't support speech synthesis
            return Promise.resolve([]);
        }

        return new Promise((resolve) => {
            // Short delay to give the voice list a chance to populate on some browsers.
            setTimeout(() => {
                const voices = speechSynthesis.getVoices();

                if (voices.length === 0) {
                    // If no voices returned (e.g., Edge on first load), provide a default option
                    console.warn('SystemTTS: getVoices() returned empty list. Providing browser default option.');
                    resolve([{
                        name: SystemTtsProvider.BROWSER_DEFAULT_VOICE_NAME,
                        voice_id: SystemTtsProvider.BROWSER_DEFAULT_VOICE_ID,
                        preview_url: false,
                        // Try to guess the browser's default language
                        lang: navigator.language || 'en-US',
                    }]);
                    return;
                }

                // Sort a copy so the array handed out by the browser is left untouched.
                const mappedVoices = [...voices]
                    .sort((a, b) => a.lang.localeCompare(b.lang) || a.name.localeCompare(b.name))
                    .map(x => ({ name: x.name, voice_id: x.voiceURI, preview_url: false, lang: x.lang }));
                resolve(mappedVoices);
            }, 50);
        });
    }

    /**
     * Speaks a short preview sentence with the given voice, cancelling any
     * speech in progress. Falls back to the browser's default voice when the
     * voice is the default placeholder or cannot be found.
     * @param {string} voiceId `voiceURI` of the voice, or `BROWSER_DEFAULT_VOICE_ID`.
     * @throws {Error} If the speech synthesis API is not supported.
     */
    previewTtsVoice(voiceId) {
        if (!('speechSynthesis' in window)) {
            throw new Error('Speech synthesis API is not supported'); // Keep Error type for consistency
        }

        let voice = null;
        if (voiceId === SystemTtsProvider.BROWSER_DEFAULT_VOICE_ID) {
            console.log('SystemTTS Preview: Using browser default voice as requested.');
        } else {
            const voices = speechSynthesis.getVoices();
            voice = voices.find(x => x.voiceURI === voiceId) ?? null;

            // In both fallback cases `voice` stays null and the browser default is used.
            if (!voice && voices.length > 0) {
                console.warn(`SystemTTS Preview: Voice ID "${voiceId}" not found among available voices. Using browser default.`);
            } else if (!voice) {
                console.warn('SystemTTS Preview: Voice list is empty. Using browser default.');
            }
        }

        speechSynthesis.cancel(); // Stop any previous speech

        // Preview text follows the voice's language when known, otherwise the browser language.
        const langForPreview = voice ? voice.lang : (navigator.language || 'en-US');
        const utterance = new SpeechSynthesisUtterance(getPreviewString(langForPreview));

        // Leaving utterance.voice unset makes the browser use its default voice.
        if (voice) {
            utterance.voice = voice;
        }

        utterance.rate = this.settings.rate || 1;
        // `??` rather than `||`: a pitch of 0 is a valid setting and must not become 1.
        utterance.pitch = this.settings.pitch ?? 1;

        utterance.onerror = (event) => {
            console.error(`SystemTTS Preview Error: ${event.error}`, event);
        };

        speechSynthesis.speak(utterance);
    }

    /**
     * Resolves a voice name to its id.
     * @param {string} voiceName Voice name, or `BROWSER_DEFAULT_VOICE_NAME`.
     * @returns {Promise<{voice_id: (string|null), name: string}>} The matching voice;
     *   the default placeholder when requested or when the voice list is empty;
     *   `voice_id: null` when the API is unsupported.
     * @throws {Error} If voices are available but none has the given name.
     */
    async getVoice(voiceName) {
        if (!('speechSynthesis' in window)) {
            // Return a predictable null-like structure if API not supported
            return { voice_id: null, name: 'API Not Supported' };
        }

        const browserDefault = {
            voice_id: SystemTtsProvider.BROWSER_DEFAULT_VOICE_ID,
            name: SystemTtsProvider.BROWSER_DEFAULT_VOICE_NAME,
        };

        if (voiceName === SystemTtsProvider.BROWSER_DEFAULT_VOICE_NAME) {
            return browserDefault;
        }

        // Note: relies on the voice list already being populated at this point.
        const voices = speechSynthesis.getVoices();

        if (voices.length === 0) {
            console.warn(`SystemTTS getVoice: Voice list empty, cannot find "${voiceName}". Falling back to browser default ID.`);
            return browserDefault;
        }

        const match = voices.find(x => x.name === voiceName);

        if (!match) {
            throw new Error(`SystemTTS getVoice: TTS Voice name "${voiceName}" not found`);
        }

        return { voice_id: match.voiceURI, name: match.name };
    }

    /**
     * Speaks `text` with the given voice via the chunker and resolves with the
     * response for a silent audio file once speaking has finished.
     * @param {string} text Text to speak.
     * @param {string} voiceId `voiceURI` of the voice; an unknown id (including the
     *   browser-default placeholder) leaves the voice unset.
     * @returns {Promise<Response>} The fetched `/sounds/silence.mp3` response.
     * @throws {Error} If the speech synthesis API is not supported.
     */
    async generateTts(text, voiceId) {
        if (!('speechSynthesis' in window)) {
            throw new Error('Speech synthesis API is not supported');
        }

        const silence = await fetch('/sounds/silence.mp3');

        return new Promise((resolve, reject) => {
            const voice = speechSynthesis.getVoices().find(x => x.voiceURI === voiceId);
            const utterance = new SpeechSynthesisUtterance(text);
            utterance.voice = voice ?? null;
            utterance.rate = this.settings.rate || 1;
            // `??` rather than `||`: a pitch of 0 is a valid setting and must not become 1.
            utterance.pitch = this.settings.pitch ?? 1;
            utterance.onend = () => resolve(silence);
            // Reject with a real Error so callers always receive something they can inspect.
            utterance.onerror = (event) => reject(new Error(`SystemTTS: speech synthesis failed: ${event?.error ?? 'unknown error'}`));
            speechUtteranceChunker(utterance, {
                chunkLength: 200,
            }, function () {
                // All chunks have been spoken.
                resolve(silence);
                console.log('System TTS done');
            });
        });
    }
}