import { callPopup, cancelTtsPlay, eventSource, event_types, saveSettingsDebounced } from '../../../script.js';
import { ModuleWorkerWrapper, doExtrasFetch, extension_settings, getApiUrl, getContext, modules } from '../../extensions.js';
import { escapeRegex, getStringHash } from '../../utils.js';
import { EdgeTtsProvider } from './edge.js';
import { ElevenLabsTtsProvider } from './elevenlabs.js';
import { SileroTtsProvider } from './silerotts.js';
import { CoquiTtsProvider } from './coqui.js';
import { SystemTtsProvider } from './system.js';
import { NovelTtsProvider } from './novel.js';
import { power_user } from '../../power-user.js';
import { rvcVoiceConversion } from "../rvc/index.js";
export { talkingAnimation };

const UPDATE_INTERVAL = 1000;

let voiceMapEntries = [];
let voiceMap = {}; // {charName:voiceid, charName2:voiceid2}
let audioControl;
let storedvalue = false; // Last value passed to talkingAnimation that was actually sent

let lastCharacterId = null;
let lastGroupId = null;
let lastChatId = null;
let lastMessageHash = null;

const DEFAULT_VOICE_MARKER = '[Default Voice]';
const DISABLED_VOICE_MARKER = 'disabled';

/**
 * Returns a sample sentence for previewing a voice in the given locale.
 * @param {string} lang Locale tag used as the lookup key, e.g. 'en-US'
 * @returns {string} The sentence for that locale, or a Latin placeholder when the locale is not listed
 */
export function getPreviewString(lang) {
    const previewStrings = {
        'en-US': 'The quick brown fox jumps over the lazy dog',
        'en-GB': 'Sphinx of black quartz, judge my vow',
        'fr-FR': 'Portez ce vieux whisky au juge blond qui fume',
        'de-DE': 'Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich',
        'it-IT': "Pranzo d'acqua fa volti sghembi",
        'es-ES': 'Quiere la boca exhausta vid, kiwi, piña y fugaz jamón',
        'es-MX': 'Fabio me exige, sin tapujos, que añada cerveza al whisky',
        'ru-RU': 'В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!',
        'pt-BR': 'Vejo xá gritando que fez show sem playback.',
        'pt-PR': 'Todo pajé vulgar faz boquinha sexy com kiwi.',
        'uk-UA': "Фабрикуймо гідність, лящім їжею, ґав хапаймо, з'єднавці чаш!",
        'pl-PL': 'Pchnąć w tę łódź jeża lub ośm skrzyń fig',
        'cs-CZ': 'Příliš žluťoučký kůň úpěl ďábelské ódy',
        'sk-SK': 'Vyhŕňme si rukávy a vyprážajme čínske ryžové cestoviny',
        'hu-HU': 'Árvíztűrő tükörfúrógép',
        'tr-TR': 'Pijamalı hasta yağız şoföre çabucak güvendi',
        'nl-NL': 'De waard heeft een kalfje en een pinkje opgegeten',
        'sv-SE': 'Yxskaftbud, ge vårbygd, zinkqvarn',
        'da-DK': 'Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen Walther spillede på xylofon',
        'ja-JP': 'いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす',
        'ko-KR': '가나다라마바사아자차카타파하',
        'zh-CN': '我能吞下玻璃而不伤身体',
        'ro-RO': 'Muzicologă în bej vând whisky și tequila, preț fix',
        'bg-BG': 'Щъркелите се разпръснаха по цялото небе',
        'el-GR': 'Ταχίστη αλώπηξ βαφής ψημένη γη, δρασκελίζει υπέρ νωθρού κυνός',
        'fi-FI': 'Voi veljet, miksi juuri teille myin nämä vehkeet?',
        'he-IL': 'הקצינים צעקו: "כל הכבוד לצבא הצבאות!"',
        'id-ID': 'Jangkrik itu memang enak, apalagi kalau digoreng',
        'ms-MY': 'Muzik penyanyi wanita itu menggambarkan kehidupan yang penuh dengan duka nestapa',
        'th-TH': 'เป็นไงบ้างครับ ผมชอบกินข้าวผัดกระเพราหมูกรอบ',
        'vi-VN': 'Cô bé quàng khăn đỏ đang ngồi trên bãi cỏ xanh',
        'ar-SA': 'أَبْجَدِيَّة عَرَبِيَّة',
        'hi-IN': 'श्वेता ने श्वेता के श्वेते हाथों में श्वेता का श्वेता चावल पकड़ा',
    };
    const fallbackPreview = 'Neque porro quisquam est qui dolorem ipsum quia dolor sit amet';

    return previewStrings[lang] ?? fallbackPreview;
}

// Provider display name -> provider class. Instantiated by loadTtsProvider.
let ttsProviders = {
    ElevenLabs: ElevenLabsTtsProvider,
    Silero: SileroTtsProvider,
    System: SystemTtsProvider,
    Coqui: CoquiTtsProvider,
    Edge: EdgeTtsProvider,
    Novel: NovelTtsProvider,
};

let ttsProvider;
let ttsProviderName;

/**
 * Returns the text of the last message in a chat, or '' when the chat is empty
 * or the last message has no text. Always yields a string so it can be hashed.
 * @param {Array<{mes?: string}>} chat Chat message array
 * @returns {string}
 */
function getLastMessageText(chat) {
    return chat[chat.length - 1]?.mes ?? '';
}

/**
 * Click handler: narrates the single chat message that contains the clicked element.
 * Any playback or queued work in progress is discarded first.
 */
async function onNarrateOneMessage() {
    audioElement.src = '/sounds/silence.mp3';
    const context = getContext();
    const id = $(this).closest('.mes').attr('mesid');
    const message = context.chat[id];

    if (!message) {
        return;
    }

    resetTtsPlayback();
    ttsJobQueue.push(message);
    moduleWorker();
}

/**
 * One iteration of the extension worker: drives both job queues, refreshes the
 * play/stop icon, and decides whether the newest chat message should be queued for TTS.
 */
async function moduleWorker() {
    // Primarily determining when to add new chat to the TTS queue
    const enabled = $('#tts_enabled').is(':checked');
    $('body').toggleClass('tts', enabled);
    if (!enabled) {
        return;
    }

    const context = getContext();
    const chat = context.chat;

    processTtsQueue();
    processAudioJobQueue();
    updateUiAudioPlayState();

    // Auto generation is disabled
    if (extension_settings.tts.auto_generation == false) {
        return;
    }

    // no characters or group selected
    if (!context.groupId && context.characterId === undefined) {
        return;
    }

    // Chat changed
    if (context.chatId !== lastChatId) {
        currentMessageNumber = context.chat.length ? context.chat.length : 0;
        saveLastValues();
        return;
    }

    // take the count of messages
    const lastMessageNumber = context.chat.length ? context.chat.length : 0;

    // There's no new messages
    const diff = lastMessageNumber - currentMessageNumber;
    const hashNew = getStringHash(getLastMessageText(chat));

    if (diff == 0 && hashNew === lastMessageHash) {
        return;
    }

    const message = chat[chat.length - 1];

    // We're currently swiping or streaming. Don't generate voice
    if (
        !message ||
        message.mes === '...' ||
        message.mes === '' ||
        (context.streamingProcessor && !context.streamingProcessor.isFinished)
    ) {
        return;
    }

    // Don't generate if message doesn't have a display text
    if (extension_settings.tts.narrate_translated_only && !(message?.extra?.display_text)) {
        return;
    }

    // Don't generate if message is a user message and user message narration is disabled
    if (message.is_user && !extension_settings.tts.narrate_user) {
        return;
    }

    // New messages, add new chat to history
    lastMessageHash = hashNew;
    currentMessageNumber = lastMessageNumber;

    console.debug(
        `Adding message from ${message.name} for TTS processing: "${message.mes}"`
    );
    ttsJobQueue.push(message);
}

/**
 * Starts or stops the talking animation through the extras API, then refreshes
 * the play/stop icon. The request is only sent when the requested state differs
 * from the last one sent, and only when the 'talkinghead' module is listed in `modules`.
 * @param {boolean} switchValue true to start the animation, false to stop it
 */
function talkingAnimation(switchValue) {
    if (!modules.includes('talkinghead')) {
        console.debug("Talking Animation module not loaded");
        return;
    }

    const apiUrl = getApiUrl();
    const animationType = switchValue ? "start" : "stop";

    if (switchValue !== storedvalue) {
        try {
            console.log(animationType + " Talking Animation");
            // The request is fire-and-forget. Promise.resolve(...).catch handles an
            // asynchronous failure, which the surrounding try/catch cannot see.
            Promise.resolve(doExtrasFetch(`${apiUrl}/api/talkinghead/${animationType}_talking`))
                .catch(error => console.debug('Talking Animation request failed', error));
            storedvalue = switchValue; // Update the storedvalue to the current switchValue
        } catch (error) {
            // Best effort: the animation is cosmetic, so a failure must not break playback
            console.debug('Talking Animation request failed', error);
        }
    }
    updateUiAudioPlayState();
}

/**
 * Stops everything TTS is doing: cancels system speech, drops current and
 * queued jobs, and resets the audio element.
 */
function resetTtsPlayback() {
    // Stop system TTS utterance
    cancelTtsPlay();

    // Clear currently processing jobs
    currentTtsJob = null;
    currentAudioJob = null;

    // Reset audio element
    audioElement.currentTime = 0;
    audioElement.src = '';

    // Clear any queue items
    ttsJobQueue.splice(0, ttsJobQueue.length);
    audioJobQueue.splice(0, audioJobQueue.length);

    // Set audio ready to process again
    audioQueueProcessorReady = true;
}

/**
 * @returns {boolean} true when any TTS or audio job is queued or in flight
 */
function isTtsProcessing() {
    const hasQueuedJobs = ttsJobQueue.length > 0 || audioJobQueue.length > 0;
    const hasCurrentJob = currentTtsJob != null || currentAudioJob != null;
    return hasQueuedJobs || hasCurrentJob;
}

/**
 * Logs a JSON snapshot of the extension state. Exposed on `window` for use from the console.
 */
function debugTtsPlayback() {
    console.log(JSON.stringify(
        {
            "ttsProviderName": ttsProviderName,
            "voiceMap": voiceMap,
            "currentMessageNumber": currentMessageNumber,
            "audioPaused": audioPaused,
            "audioJobQueue": audioJobQueue,
            "currentAudioJob": currentAudioJob,
            "audioQueueProcessorReady": audioQueueProcessorReady,
            "ttsJobQueue": ttsJobQueue,
            "currentTtsJob": currentTtsJob,
            "ttsConfig": extension_settings.tts
        }
    ));
}
window.debugTtsPlayback = debugTtsPlayback;

//##################//
//   Audio Control  //
//##################//

let audioElement = new Audio();
audioElement.autoplay = true;

let audioJobQueue = [];
let currentAudioJob;
let audioPaused = false;
let audioQueueProcessorReady = true;

let lastAudioPosition = 0;

// MIME types accepted by addAudioJob
const SUPPORTED_AUDIO_TYPES = ['audio/mpeg', 'audio/wav', 'audio/x-wav', 'audio/wave', 'audio/webm'];

/**
 * 'canplay' handler shared by every playback. Using one function reference means
 * repeated addEventListener calls do not stack additional listeners.
 */
function startAudioPlayback() {
    console.debug(`Starting TTS playback`);
    audioElement.play();
}

/**
 * Loads an audio blob into the shared audio element and starts playback.
 * @param {Blob} audioBlob Audio data to play; expected to be the current audio job
 */
async function playAudioData(audioBlob) {
    // Since current audio job can be cancelled, don't playback if it is null
    if (currentAudioJob == null) {
        console.log("Cancelled TTS playback because currentAudioJob was null");
        // Release the processor so the next queued job is not blocked
        audioQueueProcessorReady = true;
        return;
    }

    const reader = new FileReader();
    reader.onload = function (e) {
        // The read is asynchronous: drop the result if this job was cancelled or replaced meanwhile
        if (currentAudioJob !== audioBlob) {
            return;
        }
        audioElement.src = e.target.result;
    };
    reader.readAsDataURL(audioBlob);

    audioElement.addEventListener('ended', completeCurrentAudioJob);
    audioElement.addEventListener('canplay', startAudioPlayback);
}

/**
 * Global hook for previewing a voice. Plays the audio element with the given id
 * when one exists and is not flagged disabled; otherwise asks the provider for a preview.
 * @param {string} id Element id / voice identifier
 */
window['tts_preview'] = function (id) {
    const audio = document.getElementById(id);

    if (audio && !$(audio).data('disabled')) {
        audio.play();
    } else {
        ttsProvider.previewTtsVoice(id);
    }
};

/**
 * Click handler: shows a popup listing the voices offered by the current provider.
 */
async function onTtsVoicesClick() {
    let popupText = '';

    try {
        const voiceIds = await ttsProvider.fetchTtsVoiceObjects();

        // NOTE(review): these templates contain no markup and the preview template is empty;
        // the HTML looks to have been stripped from this copy of the file. Confirm against the original.
        for (const voice of voiceIds) {
            popupText += `
${voice.lang || ''} ${voice.name}
`;
            if (voice.preview_url) {
                popupText += ``;
            }
        }
    } catch (error) {
        console.error(error);
        popupText = 'Could not load voices list. Check your API key.';
    }

    callPopup(popupText, 'text');
}

/**
 * Shows or hides the TTS menu item according to the enabled setting, and sets
 * its icon to "stop" while anything is playing or processing, "play" otherwise.
 */
function updateUiAudioPlayState() {
    if (extension_settings.tts.enabled == true) {
        $('#ttsExtensionMenuItem').show();
        // Give user feedback that TTS is active by setting the stop icon if processing or playing
        const isBusy = !audioElement.paused || isTtsProcessing();
        const img = isBusy
            ? 'fa-solid fa-stop-circle extensionsMenuExtensionButton'
            : 'fa-solid fa-circle-play extensionsMenuExtensionButton';
        $('#tts_media_control').attr('class', img);
    } else {
        $('#ttsExtensionMenuItem').hide();
    }
}

/**
 * Click handler for the TTS menu item: stops everything when TTS is active,
 * otherwise queues the last chat message for narration.
 */
function onAudioControlClicked() {
    audioElement.src = '/sounds/silence.mp3';
    const context = getContext();
    // Not pausing, doing a full stop to anything TTS is doing. Better UX as pause is not as useful
    if (!audioElement.paused || isTtsProcessing()) {
        resetTtsPlayback();
        talkingAnimation(false);
    } else {
        // Default play behavior if not processing or playing is to play the last message.
        const lastMessage = context.chat[context.chat.length - 1];
        // An empty chat has no last message; never queue `undefined`
        if (lastMessage) {
            ttsJobQueue.push(lastMessage);
        }
    }
    updateUiAudioPlayState();
}

/**
 * Adds the TTS playback entry to the extensions menu and wires its click handler.
 */
function addAudioControl() {
    // NOTE(review): this template contains only text; its markup looks to have been
    // stripped from this copy of the file. Confirm against the original.
    $('#extensionsMenu').prepend(`
TTS Playback
`);
    $('#ttsExtensionMenuItem').attr('title', 'TTS play/pause').on('click', onAudioControlClicked);
    audioControl = document.getElementById('tts_media_control');
    updateUiAudioPlayState();
}

/**
 * 'ended' handler: marks the current audio job finished so the next one can start.
 */
function completeCurrentAudioJob() {
    audioQueueProcessorReady = true;
    currentAudioJob = null;
    lastAudioPosition = 0;
    talkingAnimation(false); //stop lip animation
}

/**
 * Accepts an HTTP response containing audio data, and puts the data as a Blob() on the queue for playback
 * @param {*} response Response-like object exposing an async blob() method
 * @throws {Error} When the blob's MIME type is not one of SUPPORTED_AUDIO_TYPES
 */
async function addAudioJob(response) {
    const audioData = await response.blob();
    // Compare only the bare MIME type, ignoring any ';parameter' suffix
    const mimeType = audioData.type.split(';')[0].trim().toLowerCase();
    if (!SUPPORTED_AUDIO_TYPES.includes(mimeType)) {
        throw new Error(`TTS received HTTP response with invalid data format. Expecting audio/mpeg, got ${audioData.type}`);
    }
    audioJobQueue.push(audioData);
    console.debug('Pushed audio job to queue.');
}

/**
 * Starts playback of the oldest queued audio job when the player is idle.
 */
async function processAudioJobQueue() {
    // Nothing to do, audio not completed, or audio paused - stop processing.
    if (audioJobQueue.length == 0 || !audioQueueProcessorReady || audioPaused) {
        return;
    }
    try {
        audioQueueProcessorReady = false;
        // Oldest first, so audio plays in the order it was generated
        currentAudioJob = audioJobQueue.shift();
        playAudioData(currentAudioJob);
        talkingAnimation(true);
    } catch (error) {
        console.error(error);
        audioQueueProcessorReady = true;
    }
}

//################//
//  TTS Control   //
//################//

let ttsJobQueue = [];
let currentTtsJob; // Null if nothing is currently being processed
let currentMessageNumber = 0;

/**
 * Marks the current TTS job as finished. Safe to call when no job is active.
 */
function completeTtsJob() {
    if (currentTtsJob) {
        console.info(`Current TTS job for ${currentTtsJob.name} completed.`);
    }
    currentTtsJob = null;
}

/**
 * Records the current group, character, chat id and last-message hash so
 * moduleWorker can detect changes on later iterations.
 */
function saveLastValues() {
    const context = getContext();
    lastGroupId = context.groupId;
    lastCharacterId = context.characterId;
    lastChatId = context.chatId;
    lastMessageHash = getStringHash(getLastMessageText(context.chat));
}

/**
 * Generates speech for the given text, optionally passes it through RVC voice
 * conversion, and queues the resulting audio for playback.
 * @param {string} text Text to synthesize
 * @param {*} voiceId Provider voice identifier
 * @param {string} char Character name, forwarded to RVC
 */
async function tts(text, voiceId, char) {
    // Remember which job this generation belongs to, so a cancellation can be detected afterwards
    const job = currentTtsJob;

    let response = await ttsProvider.generateTts(text, voiceId);

    // RVC injection
    if (extension_settings.rvc?.enabled) {
        response = await rvcVoiceConversion(response, char, text);
    }

    // Playback was reset or another job took over while generating: discard the stale audio
    if (job !== currentTtsJob) {
        console.debug('TTS job was cancelled during generation, discarding audio.');
        return;
    }

    await addAudioJob(response);
    completeTtsJob();
}

/**
 * Takes the next message off the TTS queue, applies the narration filters from
 * settings, resolves the character's voice and generates the audio.
 * Any failure is logged and the job released so the queue keeps moving.
 */
async function processTtsQueue() {
    // Called each moduleWorker iteration to pull chat messages from queue
    if (currentTtsJob || ttsJobQueue.length <= 0 || audioPaused) {
        return;
    }

    console.debug('New message found, running TTS');
    const job = ttsJobQueue.shift();
    if (!job) {
        console.warn('Skipping empty entry in TTS queue.');
        return;
    }
    currentTtsJob = job;

    try {
        // Fall back to the raw message when a translated text was requested but is not available
        let text = (extension_settings.tts.narrate_translated_only
            ? (job.extra?.display_text ?? job.mes)
            : job.mes) ?? '';

        text = extension_settings.tts.narrate_dialogues_only
            ? text.replace(/\*[^\*]*?(\*|$)/g, '').trim() // remove asterisks content
            : text.replaceAll('*', '').trim(); // remove just the asterisks

        if (extension_settings.tts.narrate_quoted_only) {
            const special_quotes = /[“”]/g; // Extend this regex to include other special quotes
            text = text.replace(special_quotes, '"');
            const matches = text.match(/".*?"/g); // Matches text inside double quotes, non-greedily
            const partJoiner = (ttsProvider?.separator || ' ... ');
            text = matches ? matches.join(partJoiner) : text;
        }
        console.log(`TTS: ${text}`);
        const char = job.name;

        // Remove character name from start of the line if power user setting is disabled
        if (char && !power_user.allow_name2_display) {
            const escapedChar = escapeRegex(char);
            text = text.replace(new RegExp(`^${escapedChar}:`, 'gm'), '');
        }

        if (!text) {
            console.warn('Got empty text in TTS queue job.');
            completeTtsJob();
            return;
        }

        const voiceMapEntry = voiceMap[char] === DEFAULT_VOICE_MARKER ? voiceMap[DEFAULT_VOICE_MARKER] : voiceMap[char];

        if (!voiceMapEntry || voiceMapEntry === DISABLED_VOICE_MARKER) {
            throw new Error(`${char} not in voicemap. Configure character in extension settings voice map`);
        }
        const voice = await ttsProvider.getVoice(voiceMapEntry);
        const voiceId = voice?.voice_id;
        if (voiceId == null) {
            toastr.error(`Specified voice for ${char} was not found. Check the TTS extension settings.`);
            throw new Error(`Unable to attain voiceId for ${char}`);
        }
        // Awaited so that generation failures land in the catch below
        await tts(text, voiceId, char);
    } catch (error) {
        console.error(error);
        // Release only our own job; a newer job may have started after a reset
        if (currentTtsJob === job) {
            currentTtsJob = null;
        }
    }
}

// Secret function for now
async function playFullConversation() {
    const context = getContext();
    const chat = context.chat;
    // Copy: the queue is consumed with shift()/splice(), which must not touch the live chat
    ttsJobQueue = [...chat];
}
window.playFullConversation = playFullConversation;

//#############################//
//  Extension UI and Settings  //
//#############################//

/**
 * Fills in any missing TTS settings from defaultSettings and reflects the
 * stored settings in the settings UI controls.
 */
function loadSettings() {
    // Covers both a brand-new (empty) settings object and keys added to the defaults later
    for (const [key, value] of Object.entries(defaultSettings)) {
        if (!(key in extension_settings.tts)) {
            extension_settings.tts[key] = value;
        }
    }

    $('#tts_provider').val(extension_settings.tts.currentProvider);
    $('#tts_enabled').prop(
        'checked',
        extension_settings.tts.enabled
    );
    $('#tts_narrate_dialogues').prop('checked', extension_settings.tts.narrate_dialogues_only);
    $('#tts_narrate_quoted').prop('checked', extension_settings.tts.narrate_quoted_only);
    $('#tts_auto_generation').prop('checked', extension_settings.tts.auto_generation);
    $('#tts_narrate_translated_only').prop('checked', extension_settings.tts.narrate_translated_only);
    $('#tts_narrate_user').prop('checked', extension_settings.tts.narrate_user);
    // toggleClass only honors a real boolean; a non-boolean state would toggle the class instead
    $('body').toggleClass('tts', Boolean(extension_settings.tts.enabled));
}

const defaultSettings = {
    voiceMap: '',
    ttsEnabled: false, // kept for backward compatibility; the code in this file reads `enabled`
    enabled: false,
    currentProvider: "ElevenLabs",
    auto_generation: true,
    narrate_user: false,
};

/**
 * Writes a status line into the settings panel.
 * @param {*} status Text to display
 * @param {boolean} success true clears the inline style, false colors the text red
 */
function setTtsStatus(status, success) {
    $('#tts_status').text(status);
    if (success) {
        $('#tts_status').removeAttr('style');
    } else {
        $('#tts_status').css('color', 'red');
    }
}

/**
 * Click handler: lets the provider apply its settings, persists them, and rebuilds the voice map.
 */
function onRefreshClick() {
    Promise.all([
        ttsProvider.onRefreshClick(),
    ]).then(() => {
        extension_settings.tts[ttsProviderName] = ttsProvider.settings;
        saveSettingsDebounced();
        setTtsStatus('Successfully applied settings', true);
        console.info(`Saved settings ${ttsProviderName} ${JSON.stringify(ttsProvider.settings)}`);
        initVoiceMap();
        updateVoiceMap();
    }).catch(error => {
        console.error(error);
        setTtsStatus(error, false);
    });
}

function onEnableClick() {
    extension_settings.tts.enabled = $('#tts_enabled').is(
        ':checked'
    );
    updateUiAudioPlayState();
    saveSettingsDebounced();
}

function onAutoGenerationClick() {
    extension_settings.tts.auto_generation = !!$('#tts_auto_generation').prop('checked');
    saveSettingsDebounced();
}

function onNarrateDialoguesClick() {
    extension_settings.tts.narrate_dialogues_only = !!$('#tts_narrate_dialogues').prop('checked');
    saveSettingsDebounced();
}

function onNarrateUserClick() {
    extension_settings.tts.narrate_user = !!$('#tts_narrate_user').prop('checked');
    saveSettingsDebounced();
}

function onNarrateQuotedClick() {
    extension_settings.tts.narrate_quoted_only = !!$('#tts_narrate_quoted').prop('checked');
    saveSettingsDebounced();
}

function onNarrateTranslatedOnlyClick() {
    extension_settings.tts.narrate_translated_only = !!$('#tts_narrate_translated_only').prop('checked');
    saveSettingsDebounced();
}

//##############//
// TTS Provider //
//##############//

/**
 * Instantiates the named provider, renders its settings UI, loads its saved
 * settings and rebuilds the voice map.
 * @param {string} provider Key of `ttsProviders`; a falsy value only clears the provider settings UI
 */
async function loadTtsProvider(provider) {
    //Clear the current config and add new config
    $("#tts_provider_settings").html("");

    if (!provider) {
        return;
    }

    // Reject unknown names up front instead of failing on `new undefined`
    if (!Object.prototype.hasOwnProperty.call(ttsProviders, provider)) {
        console.error(`Unknown TTS provider: ${provider}`);
        setTtsStatus(`Unknown TTS provider: ${provider}`, false);
        return;
    }

    // Init provider references
    extension_settings.tts.currentProvider = provider;
    ttsProviderName = provider;
    ttsProvider = new ttsProviders[provider]();

    // Init provider settings
    $('#tts_provider_settings').append(ttsProvider.settingsHtml);
    if (!(ttsProviderName in extension_settings.tts)) {
        console.warn(`Provider ${ttsProviderName} not in Extension Settings, initiatilizing provider in settings`);
        extension_settings.tts[ttsProviderName] = {};
    }
    await ttsProvider.loadSettings(extension_settings.tts[ttsProviderName]);
    await initVoiceMap();
}
function onTtsProviderChange() { const ttsProviderSelection = $('#tts_provider').val() extension_settings.tts.currentProvider = ttsProviderSelection loadTtsProvider(ttsProviderSelection) } // Ensure that TTS provider settings are saved to extension settings. export function saveTtsProviderSettings() { updateVoiceMap() extension_settings.tts[ttsProviderName] = ttsProvider.settings saveSettingsDebounced() console.info(`Saved settings ${ttsProviderName} ${JSON.stringify(ttsProvider.settings)}`) } //###################// // voiceMap Handling // //###################// async function onChatChanged() { await resetTtsPlayback() await initVoiceMap() } function getCharacters(){ const context = getContext() let characters = [] if (context.groupId === null){ // Single char chat characters.push(DEFAULT_VOICE_MARKER) characters.push(context.name1) characters.push(context.name2) } else { // Group chat characters.push(DEFAULT_VOICE_MARKER) characters.push(context.name1) const group = context.groups.find(group => context.groupId == group.id) for (let member of group.members) { // Remove suffix if (member.endsWith('.png')){ member = member.slice(0, -4) } characters.push(member) } } return characters } function sanitizeId(input) { // Remove any non-alphanumeric characters except underscore (_) and hyphen (-) let sanitized = input.replace(/[^a-zA-Z0-9-_]/g, ''); // Ensure first character is always a letter if (!/^[a-zA-Z]/.test(sanitized)) { sanitized = 'element_' + sanitized; } return sanitized; } function parseVoiceMap(voiceMapString) { let parsedVoiceMap = {} for (const [charName, voiceId] of voiceMapString .split(',') .map(s => s.split(':'))) { if (charName && voiceId) { parsedVoiceMap[charName.trim()] = voiceId.trim() } } return parsedVoiceMap } /** * Apply voiceMap based on current voiceMapEntries */ function updateVoiceMap() { const tempVoiceMap = {} for (const voice of voiceMapEntries){ if (voice.voiceId === null){ continue } tempVoiceMap[voice.name] = voice.voiceId } if 
(Object.keys(tempVoiceMap).length !== 0){ voiceMap = tempVoiceMap console.log(`Voicemap updated to ${JSON.stringify(voiceMap)}`) } Object.assign(extension_settings.tts[ttsProviderName].voiceMap, voiceMap) saveSettingsDebounced() } class VoiceMapEntry { name voiceId selectElement constructor (name, voiceId=DEFAULT_VOICE_MARKER) { this.name = name this.voiceId = voiceId this.selectElement = null } addUI(voiceIds){ let sanitizedName = sanitizeId(this.name) let defaultOption = this.name === DEFAULT_VOICE_MARKER ? `` : `` let template = `
${this.name}
` $('#tts_voicemap_block').append(template) // Populate voice ID select list for (const voiceId of voiceIds){ const option = document.createElement('option'); option.innerText = voiceId.name; option.value = voiceId.name; $(`#tts_voicemap_char_${sanitizedName}_voice`).append(option) } this.selectElement = $(`#tts_voicemap_char_${sanitizedName}_voice`) this.selectElement.on('change', args => this.onSelectChange(args)) this.selectElement.val(this.voiceId) } onSelectChange(args) { this.voiceId = this.selectElement.find(':selected').val() updateVoiceMap() } } /** * Init voiceMapEntries for character select list. * */ export async function initVoiceMap(){ // Clear existing voiceMap state $('#tts_voicemap_block').empty() voiceMapEntries = [] // Gate initialization if not enabled or TTS Provider not ready. Prevents error popups. const enabled = $('#tts_enabled').is(':checked') if (!enabled){ return } // Keep errors inside extension UI rather than toastr. Toastr errors for TTS are annoying. try { await ttsProvider.checkReady() } catch (error) { const message = `TTS Provider not ready. 
${error}` setTtsStatus(message, false) return } setTtsStatus("TTS Provider Loaded", true) // Get characters in current chat const characters = getCharacters() // Get saved voicemap from provider settings, handling new and old representations let voiceMapFromSettings = {} if ("voiceMap" in extension_settings.tts[ttsProviderName]) { // Handle previous representation if (typeof extension_settings.tts[ttsProviderName].voiceMap === "string"){ voiceMapFromSettings = parseVoiceMap(extension_settings.tts[ttsProviderName].voiceMap) // Handle new representation } else if (typeof extension_settings.tts[ttsProviderName].voiceMap === "object"){ voiceMapFromSettings = extension_settings.tts[ttsProviderName].voiceMap } } // Get voiceIds from provider let voiceIdsFromProvider try { voiceIdsFromProvider = await ttsProvider.fetchTtsVoiceObjects() } catch { toastr.error("TTS Provider failed to return voice ids.") } // Build UI using VoiceMapEntry objects for (const character of characters){ if (character === "SillyTavern System"){ continue } // Check provider settings for voiceIds let voiceId if (character in voiceMapFromSettings){ voiceId = voiceMapFromSettings[character] } else if (character === DEFAULT_VOICE_MARKER) { voiceId = DISABLED_VOICE_MARKER } else { voiceId = DEFAULT_VOICE_MARKER } const voiceMapEntry = new VoiceMapEntry(character, voiceId) voiceMapEntry.addUI(voiceIdsFromProvider) voiceMapEntries.push(voiceMapEntry) } updateVoiceMap() } $(document).ready(function () { function addExtensionControls() { const settingsHtml = `
TTS
Select TTS Provider

` $('#extensions_settings').append(settingsHtml) $('#tts_refresh').on('click', onRefreshClick) $('#tts_enabled').on('click', onEnableClick) $('#tts_narrate_dialogues').on('click', onNarrateDialoguesClick); $('#tts_narrate_quoted').on('click', onNarrateQuotedClick); $('#tts_narrate_translated_only').on('click', onNarrateTranslatedOnlyClick); $('#tts_auto_generation').on('click', onAutoGenerationClick); $('#tts_narrate_user').on('click', onNarrateUserClick); $('#tts_voices').on('click', onTtsVoicesClick) for (const provider in ttsProviders) { $('#tts_provider').append($("