Merge branch 'staging' into silerott-add-session-handling

2025-06-05 21:59:27 +02:00 · 2023-11-28 18:25:31 +02:00
parent 3346420527 f809d80ba2
commit a1098a4f31
118 changed files with 13535 additions and 3921 deletions
--- a/public/scripts/extensions/tts/elevenlabs.js
+++ b/public/scripts/extensions/tts/elevenlabs.js
@ -45,6 +45,8 @@ class ElevenLabsTtsProvider {
        this.settings.stability = $('#elevenlabs_tts_stability').val()
        this.settings.similarity_boost = $('#elevenlabs_tts_similarity_boost').val()
        this.settings.model = $('#elevenlabs_tts_model').find(':selected').val()
+        $('#elevenlabs_tts_stability_output').text(this.settings.stability);
+        $('#elevenlabs_tts_similarity_boost_output').text(this.settings.similarity_boost);
        saveTtsProviderSettings()
    }

@ -79,6 +81,8 @@ class ElevenLabsTtsProvider {
        $('#elevenlabs_tts_similarity_boost').on('input', this.onSettingsChange.bind(this))
        $('#elevenlabs_tts_stability').on('input', this.onSettingsChange.bind(this))
        $('#elevenlabs_tts_model').on('change', this.onSettingsChange.bind(this))
+        $('#elevenlabs_tts_stability_output').text(this.settings.stability);
+        $('#elevenlabs_tts_similarity_boost_output').text(this.settings.similarity_boost);

        try {
            await this.checkReady()
--- a/public/scripts/extensions/tts/index.js
+++ b/public/scripts/extensions/tts/index.js
@ -1,4 +1,4 @@
-import { callPopup, cancelTtsPlay, eventSource, event_types, saveSettingsDebounced } from '../../../script.js'
+import { callPopup, cancelTtsPlay, eventSource, event_types, name2, saveSettingsDebounced } from '../../../script.js'
 import { ModuleWorkerWrapper, doExtrasFetch, extension_settings, getApiUrl, getContext, modules } from '../../extensions.js'
 import { escapeRegex, getStringHash } from '../../utils.js'
 import { EdgeTtsProvider } from './edge.js'
@ -8,6 +8,9 @@ import { CoquiTtsProvider } from './coqui.js'
 import { SystemTtsProvider } from './system.js'
 import { NovelTtsProvider } from './novel.js'
 import { power_user } from '../../power-user.js'
+import { registerSlashCommand } from '../../slash-commands.js'
+import { OpenAITtsProvider } from './openai.js'
+import {XTTSTtsProvider} from "./xtts.js"
 export { talkingAnimation };

 const UPDATE_INTERVAL = 1000
@ -68,14 +71,18 @@ export function getPreviewString(lang) {
 let ttsProviders = {
    ElevenLabs: ElevenLabsTtsProvider,
    Silero: SileroTtsProvider,
+    XTTSv2: XTTSTtsProvider,
    System: SystemTtsProvider,
    Coqui: CoquiTtsProvider,
    Edge: EdgeTtsProvider,
    Novel: NovelTtsProvider,
+    OpenAI: OpenAITtsProvider,
 }
 let ttsProvider
 let ttsProviderName

+let ttsLastMessage = null;
+
 async function onNarrateOneMessage() {
    audioElement.src = '/sounds/silence.mp3';
    const context = getContext();
@ -91,6 +98,36 @@ async function onNarrateOneMessage() {
    moduleWorker();
 }

+/**
+ * Slash command handler: narrates arbitrary text using a voice from the voice map.
+ * The voice map is temporarily rebuilt in unrestricted mode so that any character
+ * can be addressed, and is always restored to the chat-restricted map afterwards.
+ * @param {object} args - Named command arguments; `args.voice` optionally names the voice map entry to use.
+ * @param {string} text - Text to narrate. Nothing happens when it is empty.
+ * @returns {Promise<void>}
+ */
+async function onNarrateText(args, text) {
+    if (!text) {
+        return;
+    }
+
+    audioElement.src = '/sounds/silence.mp3';
+
+    // To load all characters in the voice map, set unrestricted to true
+    await initVoiceMap(true);
+
+    try {
+        const baseName = args?.voice || name2;
+        const name = (baseName === 'SillyTavern System' ? DEFAULT_VOICE_MARKER : baseName) || DEFAULT_VOICE_MARKER;
+
+        // An entry that points at the default marker is resolved to the default voice itself
+        const voiceMapEntry = voiceMap[name] === DEFAULT_VOICE_MARKER
+            ? voiceMap[DEFAULT_VOICE_MARKER]
+            : voiceMap[name];
+
+        if (!voiceMapEntry || voiceMapEntry === DISABLED_VOICE_MARKER) {
+            toastr.info(`Specified voice for ${name} was not found. Check the TTS extension settings.`);
+            return;
+        }
+
+        // Awaited like the other call sites so the reset cannot overlap with the job queued below
+        await resetTtsPlayback()
+        ttsJobQueue.push({ mes: text, name: name });
+        await moduleWorker();
+    } finally {
+        // Return back to the chat voices, also on the early return above and on errors
+        await initVoiceMap(false);
+    }
+}
+
 async function moduleWorker() {
    // Primarily determining when to add new chat to the TTS queue
    const enabled = $('#tts_enabled').is(':checked')
@ -122,30 +159,53 @@ async function moduleWorker() {
    ) {
        currentMessageNumber = context.chat.length ? context.chat.length : 0
        saveLastValues()
+
+        // Force to speak on the first message in the new chat
+        if (context.chat.length === 1) {
+            lastMessageHash = -1;
+        }
+
        return
    }

    // take the count of messages
-    let lastMessageNumber = context.chat.length ? context.chat.length : 0
+    let lastMessageNumber = context.chat.length ? context.chat.length : 0;

    // There's no new messages
-    let diff = lastMessageNumber - currentMessageNumber
-    let hashNew = getStringHash((chat.length && chat[chat.length - 1].mes) ?? '')
+    let diff = lastMessageNumber - currentMessageNumber;
+    let hashNew = getStringHash((chat.length && chat[chat.length - 1].mes) ?? '');

-    if (diff == 0 && hashNew === lastMessageHash) {
-        return
+    // if messages got deleted, diff will be < 0
+    if (diff < 0) {
+        // necessary actions will be taken by the onChatDeleted() handler
+        return;
    }

-    const message = chat[chat.length - 1]
+    // if no new messages, or same message, or same message hash, do nothing
+    if (diff == 0 && hashNew === lastMessageHash) {
+        return;
+    }

-    // We're currently swiping or streaming. Don't generate voice
-    if (
-        !message ||
-        message.mes === '...' ||
-        message.mes === '' ||
-        (context.streamingProcessor && !context.streamingProcessor.isFinished)
-    ) {
-        return
+    // If streaming, wait for streaming to finish before processing new messages
+    if (context.streamingProcessor && !context.streamingProcessor.isFinished) {
+        return;
+    }
+
+    // clone message object, as things go haywire if message object is altered below (it's passed by reference)
+    const message = structuredClone(chat[chat.length - 1]);
+
+    // if last message within current message, message got extended. only send diff to TTS.
+    if (ttsLastMessage !== null && message.mes.indexOf(ttsLastMessage) !== -1) {
+        let tmp = message.mes;
+        message.mes = message.mes.replace(ttsLastMessage, '');
+        ttsLastMessage = tmp;
+    } else {
+        ttsLastMessage = message.mes;
+    }
+
+    // We're currently swiping. Don't generate voice
+    if (!message || message.mes === '...' || message.mes === '') {
+        return;
    }

    // Don't generate if message doesn't have a display text
@ -246,6 +306,7 @@ window.debugTtsPlayback = debugTtsPlayback
 //##################//

 let audioElement = new Audio()
+audioElement.id = 'tts_audio'
 audioElement.autoplay = true

 let audioJobQueue = []
@ -396,7 +457,7 @@ let currentTtsJob // Null if nothing is currently being processed
 let currentMessageNumber = 0

 function completeTtsJob() {
-    console.info(`Current TTS job for ${currentTtsJob.name} completed.`)
+    console.info(`Current TTS job for ${currentTtsJob?.name} completed.`)
    currentTtsJob = null
 }

@ -441,6 +502,14 @@ async function processTtsQueue() {
        const partJoiner = (ttsProvider?.separator || ' ... ');
        text = matches ? matches.join(partJoiner) : text;
    }
+
+    if (typeof ttsProvider?.processText === 'function') {
+        text = await ttsProvider.processText(text);
+    }
+
+    // Collapse newlines and spaces into single space
+    text = text.replace(/\s+/g, ' ').trim();
+
    console.log(`TTS: ${text}`)
    const char = currentTtsJob.name

@ -628,12 +697,44 @@ export function saveTtsProviderSettings() {
 async function onChatChanged() {
    await resetTtsPlayback()
    await initVoiceMap()
+    ttsLastMessage = null
 }

-function getCharacters(){
+/**
+ * Handler registered for the MESSAGE_DELETED event.
+ * Re-syncs the message counter and last-message bookkeeping with the chat, and
+ * stops playback when the last message changed as a result of the deletion.
+ * @returns {Promise<void>}
+ */
+async function onChatDeleted() {
    const context = getContext()
+
+    // update internal references to new last message
+    lastChatId = context.chatId
+    currentMessageNumber = context.chat.length ? context.chat.length : 0
+
+    // compare against lastMessageHash. If it's the same, we did not delete the last chat item, so no need to reset tts queue
+    let messageHash = getStringHash((context.chat.length && context.chat[context.chat.length - 1].mes) ?? '')
+    if (messageHash === lastMessageHash) {
+        return
+    }
+    lastMessageHash = messageHash
+    // Always store a string: `length && mes` evaluates to the number 0 for an empty chat,
+    // and a numeric ttsLastMessage would later be matched against message text as "0"
+    ttsLastMessage = context.chat.length ? (context.chat[context.chat.length - 1].mes ?? '') : '';
+
+    // stop any tts playback since message might not exist anymore
+    await resetTtsPlayback()
+}
+
+/**
+ * Get characters in current chat
+ * @param {boolean} unrestricted - If true, will include all characters in voiceMapEntries, even if they are not in the current chat.
+ * @returns {string[]} - Array of character names
+ */
+function getCharacters(unrestricted) {
+    const context = getContext()
+
+    if (unrestricted) {
+        const names = context.characters.map(char => char.name);
+        names.unshift(DEFAULT_VOICE_MARKER);
+        return names;
+    }
+
    let characters = []
-    if (context.groupId === null){
+    if (context.groupId === null) {
        // Single char chat
        characters.push(DEFAULT_VOICE_MARKER)
        characters.push(context.name1)
@ -645,7 +746,7 @@ function getCharacters(){
        const group = context.groups.find(group => context.groupId == group.id)
        for (let member of group.members) {
            // Remove suffix
-            if (member.endsWith('.png')){
+            if (member.endsWith('.png')) {
                member = member.slice(0, -4)
            }
            characters.push(member)
@ -655,15 +756,15 @@ function getCharacters(){
 }

 function sanitizeId(input) {
-  // Remove any non-alphanumeric characters except underscore (_) and hyphen (-)
-  let sanitized = input.replace(/[^a-zA-Z0-9-_]/g, '');
+    // Remove any non-alphanumeric characters except underscore (_) and hyphen (-)
+    let sanitized = input.replace(/[^a-zA-Z0-9-_]/g, '');

-  // Ensure first character is always a letter
-  if (!/^[a-zA-Z]/.test(sanitized)) {
-    sanitized = 'element_' + sanitized;
-  }
+    // Ensure first character is always a letter
+    if (!/^[a-zA-Z]/.test(sanitized)) {
+        sanitized = 'element_' + sanitized;
+    }

-  return sanitized;
+    return sanitized;
 }

 function parseVoiceMap(voiceMapString) {
@ -685,13 +786,13 @@ function parseVoiceMap(voiceMapString) {
 */
 function updateVoiceMap() {
    const tempVoiceMap = {}
-    for (const voice of voiceMapEntries){
-        if (voice.voiceId === null){
+    for (const voice of voiceMapEntries) {
+        if (voice.voiceId === null) {
            continue
        }
        tempVoiceMap[voice.name] = voice.voiceId
    }
-    if (Object.keys(tempVoiceMap).length !== 0){
+    if (Object.keys(tempVoiceMap).length !== 0) {
        voiceMap = tempVoiceMap
        console.log(`Voicemap updated to ${JSON.stringify(voiceMap)}`)
    }
@ -706,13 +807,13 @@ class VoiceMapEntry {
    name
    voiceId
    selectElement
-    constructor (name, voiceId=DEFAULT_VOICE_MARKER) {
+    constructor(name, voiceId = DEFAULT_VOICE_MARKER) {
        this.name = name
        this.voiceId = voiceId
        this.selectElement = null
    }

-    addUI(voiceIds){
+    addUI(voiceIds) {
        let sanitizedName = sanitizeId(this.name)
        let defaultOption = this.name === DEFAULT_VOICE_MARKER ?
            `<option>${DISABLED_VOICE_MARKER}</option>` :
@ -728,7 +829,7 @@ class VoiceMapEntry {
        $('#tts_voicemap_block').append(template)

        // Populate voice ID select list
-        for (const voiceId of voiceIds){
+        for (const voiceId of voiceIds) {
            const option = document.createElement('option');
            option.innerText = voiceId.name;
            option.value = voiceId.name;
@ -748,12 +849,12 @@ class VoiceMapEntry {

 /**
 * Init voiceMapEntries for character select list.
- *
+ * @param {boolean} unrestricted - If true, will include all characters in voiceMapEntries, even if they are not in the current chat.
 */
-export async function initVoiceMap(){
+export async function initVoiceMap(unrestricted = false) {
    // Gate initialization if not enabled or TTS Provider not ready. Prevents error popups.
    const enabled = $('#tts_enabled').is(':checked')
-    if (!enabled){
+    if (!enabled) {
        return
    }

@ -771,18 +872,18 @@ export async function initVoiceMap(){
    // Clear existing voiceMap state
    $('#tts_voicemap_block').empty()
    voiceMapEntries = []
-    
+
    // Get characters in current chat
-    const characters = getCharacters()
+    const characters = getCharacters(unrestricted);

    // Get saved voicemap from provider settings, handling new and old representations
    let voiceMapFromSettings = {}
    if ("voiceMap" in extension_settings.tts[ttsProviderName]) {
        // Handle previous representation
-        if (typeof extension_settings.tts[ttsProviderName].voiceMap === "string"){
+        if (typeof extension_settings.tts[ttsProviderName].voiceMap === "string") {
            voiceMapFromSettings = parseVoiceMap(extension_settings.tts[ttsProviderName].voiceMap)
-        // Handle new representation
-        } else if (typeof extension_settings.tts[ttsProviderName].voiceMap === "object"){
+            // Handle new representation
+        } else if (typeof extension_settings.tts[ttsProviderName].voiceMap === "object") {
            voiceMapFromSettings = extension_settings.tts[ttsProviderName].voiceMap
        }
    }
@ -797,13 +898,13 @@ export async function initVoiceMap(){
    }

    // Build UI using VoiceMapEntry objects
-    for (const character of characters){
-        if (character === "SillyTavern System"){
+    for (const character of characters) {
+        if (character === "SillyTavern System") {
            continue
        }
        // Check provider settings for voiceIds
        let voiceId
-        if (character in voiceMapFromSettings){
+        if (character in voiceMapFromSettings) {
            voiceId = voiceMapFromSettings[character]
        } else if (character === DEFAULT_VOICE_MARKER) {
            voiceId = DISABLED_VOICE_MARKER
@ -897,5 +998,8 @@ $(document).ready(function () {
    setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL) // Init depends on all the things
    eventSource.on(event_types.MESSAGE_SWIPED, resetTtsPlayback);
    eventSource.on(event_types.CHAT_CHANGED, onChatChanged)
+    eventSource.on(event_types.MESSAGE_DELETED, onChatDeleted);
    eventSource.on(event_types.GROUP_UPDATED, onChatChanged)
+    registerSlashCommand('speak', onNarrateText, ['narrate', 'tts'], `<span class="monospace">(text)</span>  – narrate any text using currently selected character's voice. Use voice="Character Name" argument to set other voice from the voice map, example: <tt>/speak voice="Donald Duck" Quack!</tt>`, true, true);
+    document.body.appendChild(audioElement);
 })
--- a/public/scripts/extensions/tts/novel.js
+++ b/public/scripts/extensions/tts/novel.js
@ -19,6 +19,17 @@ class NovelTtsProvider {
        customVoices: []
    }

+    /**
+     * Normalize text before it is handed to the TTS engine.
+     * @param {string} text Input text
+     * @returns {string} Processed text
+     */
+    processText(text) {
+        // Novel reads tilde as a word, so every tilde becomes a full stop
+        const withoutTildes = text.replace(/~/g, '.');
+        return withoutTildes;
+    }
+
    get settingsHtml() {
        let html = `
        <div class="novel_tts_hints">
--- a/public/scripts/extensions/tts/openai.js
+++ b/public/scripts/extensions/tts/openai.js
@ -0,0 +1,148 @@
+import { getRequestHeaders } from "../../../script.js"
+import { saveTtsProviderSettings } from "./index.js";
+
+export { OpenAITtsProvider }
+
+/**
+ * TTS provider that generates speech through the `/api/openai/generate-voice` endpoint.
+ * The voice list is fixed; only the model and the playback speed are configurable.
+ */
+class OpenAITtsProvider {
+    /** Fixed list of selectable voices, each with a preview sample URL. */
+    static voices = [
+        { name: 'Alloy', voice_id: 'alloy', lang: 'en-US', preview_url: 'https://cdn.openai.com/API/docs/audio/alloy.wav' },
+        { name: 'Echo', voice_id: 'echo', lang: 'en-US', preview_url: 'https://cdn.openai.com/API/docs/audio/echo.wav' },
+        { name: 'Fable', voice_id: 'fable', lang: 'en-US', preview_url: 'https://cdn.openai.com/API/docs/audio/fable.wav' },
+        { name: 'Onyx', voice_id: 'onyx', lang: 'en-US', preview_url: 'https://cdn.openai.com/API/docs/audio/onyx.wav' },
+        { name: 'Nova', voice_id: 'nova', lang: 'en-US', preview_url: 'https://cdn.openai.com/API/docs/audio/nova.wav' },
+        { name: 'Shimmer', voice_id: 'shimmer', lang: 'en-US', preview_url: 'https://cdn.openai.com/API/docs/audio/shimmer.wav' },
+    ];
+
+    // Provider state; assigned in loadSettings()
+    settings
+    voices = []
+    // Joiner placed between extracted quoted parts by the TTS extension
+    separator = ' . '
+    audioElement = document.createElement('audio')
+
+    // Only keys listed here are accepted by loadSettings()
+    defaultSettings = {
+        voiceMap: {},
+        customVoices: [],
+        model: 'tts-1',
+        speed: 1,
+    }
+
+    /**
+     * Markup for the provider settings panel (model selector and speed slider).
+     * @returns {string} HTML string
+     */
+    get settingsHtml() {
+        let html = `
+        <div>Use OpenAI's TTS engine.</div>
+        <small>Hint: Save an API key in the OpenAI API settings to use it here.</small>
+        <div>
+            <label for="openai-tts-model">Model:</label>
+            <select id="openai-tts-model">
+                <optgroup label="Latest">
+                    <option value="tts-1">tts-1</option>
+                    <option value="tts-1-hd">tts-1-hd</option>
+                </optgroup>
+                <optgroup label="Snapshots">
+                    <option value="tts-1-1106">tts-1-1106</option>
+                    <option value="tts-1-hd-1106">tts-1-hd-1106</option>
+                </optgroup>
+            </select>
+        </div>
+        <div>
+            <label for="openai-tts-speed">Speed: <span id="openai-tts-speed-output"></span></label>
+            <input type="range" id="openai-tts-speed" value="1" min="0.25" max="4" step="0.25">
+        </div>`;
+        return html;
+    }
+
+    /**
+     * Apply saved settings, sync the settings UI and attach the change handlers.
+     * @param {object} settings - Saved provider settings; may be empty.
+     * @returns {Promise<void>}
+     * @throws {string} When `settings` contains a key that is not in `defaultSettings`.
+     */
+    async loadSettings(settings) {
+        // Populate Provider UI given input settings
+        if (Object.keys(settings).length == 0) {
+            console.info("Using default TTS Provider settings")
+        }
+
+        // Only accept keys defined in defaultSettings
+        this.settings = this.defaultSettings;
+
+        for (const key in settings) {
+            if (key in this.settings) {
+                this.settings[key] = settings[key];
+            } else {
+                throw `Invalid setting passed to TTS Provider: ${key}`;
+            }
+        }
+
+        $('#openai-tts-model').val(this.settings.model);
+        $('#openai-tts-model').on('change', () => {
+            this.onSettingsChange();
+        });
+
+        $('#openai-tts-speed').val(this.settings.speed);
+        $('#openai-tts-speed').on('input', () => {
+            this.onSettingsChange();
+        });
+
+        $('#openai-tts-speed-output').text(this.settings.speed);
+
+        await this.checkReady();
+        console.debug("OpenAI TTS: Settings loaded");
+    }
+
+    /**
+     * Read model and speed back from the UI, refresh the speed label and persist the settings.
+     */
+    onSettingsChange() {
+        // Update dynamically
+        this.settings.model = String($('#openai-tts-model').find(':selected').val());
+        this.settings.speed = Number($('#openai-tts-speed').val());
+        $('#openai-tts-speed-output').text(this.settings.speed);
+        saveTtsProviderSettings();
+    }
+
+    /**
+     * Readiness check; resolves once the voice list could be obtained.
+     * @returns {Promise<void>}
+     */
+    async checkReady() {
+        await this.fetchTtsVoiceObjects();
+    }
+
+    /** Refresh handler; there is nothing to refresh for this provider. */
+    async onRefreshClick() {
+        return;
+    }
+
+    /**
+     * Look up a voice by its id or display name.
+     * @param {string} voiceName - Voice id (e.g. `alloy`) or display name (e.g. `Alloy`).
+     * @returns {Promise<object>} The matching entry of `OpenAITtsProvider.voices`.
+     * @throws {string} When no name is given or no voice matches.
+     */
+    async getVoice(voiceName) {
+        if (!voiceName) {
+            throw `TTS Voice name not provided`
+        }
+
+        const voice = OpenAITtsProvider.voices.find(voice => voice.voice_id === voiceName || voice.name === voiceName);
+
+        if (!voice) {
+            throw `TTS Voice not found: ${voiceName}`
+        }
+
+        return voice;
+    }
+
+    /**
+     * Generate speech for the given text.
+     * @param {string} text - Text to synthesize.
+     * @param {string} voiceId - Voice identifier sent to the backend.
+     * @returns {Promise<Response>} The fetch response of the generation request.
+     */
+    async generateTts(text, voiceId) {
+        const response = await this.fetchTtsGeneration(text, voiceId)
+        return response
+    }
+
+    /**
+     * @returns {Promise<object[]>} The static voice list.
+     */
+    async fetchTtsVoiceObjects() {
+        return OpenAITtsProvider.voices;
+    }
+
+    /** Preview hook; intentionally a no-op (every voice entry carries a `preview_url`). */
+    async previewTtsVoice(_) {
+        return;
+    }
+
+    /**
+     * POST a generation request with the current model and speed settings.
+     * @param {string} inputText - Text to synthesize.
+     * @param {string} voiceId - Voice identifier sent to the backend.
+     * @returns {Promise<Response>} The successful fetch response.
+     * @throws {Error} When the response status is not OK; a toast is shown as well.
+     */
+    async fetchTtsGeneration(inputText, voiceId) {
+        console.info(`Generating new TTS for voice_id ${voiceId}`)
+        const response = await fetch(`/api/openai/generate-voice`, {
+            method: 'POST',
+            headers: getRequestHeaders(),
+            body: JSON.stringify({
+                "text": inputText,
+                "voice": voiceId,
+                "model": this.settings.model,
+                "speed": this.settings.speed,
+            }),
+        });
+
+        if (!response.ok) {
+            toastr.error(response.statusText, 'TTS Generation Failed');
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+        }
+
+        return response;
+    }
+}
--- a/public/scripts/extensions/tts/readme.md
+++ b/public/scripts/extensions/tts/readme.md
@ -1,8 +1,8 @@
-# Provider Requirements. 
+# Provider Requirements.
 Because I don't know how, or if you can, and/or maybe I am just too lazy to implement interfaces in JS, here's the requirements of a provider that the extension needs to operate.

 ### class YourTtsProvider
-#### Required 
+#### Required
 Exported for use in extension index.js, and added to providers list in index.js
 1. generateTts(text, voiceId)
 2. fetchTtsVoiceObjects()
@ -13,8 +13,9 @@ Exported for use in extension index.js, and added to providers list in index.js
 7. settingsHtml field

 #### Optional
-1. previewTtsVoice() 
+1. previewTtsVoice()
 2. separator field
+3. processText(text)

 # Requirement Descriptions
 ### generateTts(text, voiceId)
@ -49,14 +50,14 @@ Return without error to let TTS extension know that the provider is ready.
 Return an error to block the main TTS extension for initializing the provider and UI. The error will be put in the TTS extension UI directly.

 ### loadSettings(settingsObject)
-Required. 
+Required.
 Handle the input settings from the TTS extension on provider load.
 Put code in here to load your provider settings.

 ### settings field
 Required, used for storing any provider state that needs to be saved.
 Anything stored in this field is automatically persisted under extension_settings[providerName] by the main extension in `saveTtsProviderSettings()`, as well as loaded when the provider is selected in `loadTtsProvider(provider)`.
-TTS extension doesn't expect any specific contents. 
+TTS extension doesn't expect any specific contents.

 ### settingsHtml field
 Required, injected into the TTS extension UI. Besides adding it, not relied on by TTS extension directly.
@ -68,4 +69,8 @@ Function to handle playing previews of voice samples if no direct preview_url is
 ### separator field
 Optional.
 Used when narrate quoted text is enabled.
-Defines the string of characters used to introduce separation between between the groups of extracted quoted text sent to the provider. The provider will use this to introduce pauses by default using `...` 
+Defines the string of characters used to introduce separation between the groups of extracted quoted text sent to the provider. The provider uses it to introduce pauses; the default is `...`
+
+### processText(text)
+Optional.
+A function applied to the input text before passing it to the TTS generator. Can be async.
--- a/public/scripts/extensions/tts/system.js
+++ b/public/scripts/extensions/tts/system.js
@ -146,8 +146,8 @@ class SystemTtsProvider {
        $('#system_tts_pitch').val(this.settings.pitch || this.defaultSettings.pitch);

        // Trigger updates
-        $('#system_tts_rate').on("input", () =>{this.onSettingsChange()})
-        $('#system_tts_rate').on("input", () => {this.onSettingsChange()})
+        $('#system_tts_rate').on("input", () => { this.onSettingsChange() })
+        $('#system_tts_rate').on("input", () => { this.onSettingsChange() })

        $('#system_tts_pitch_output').text(this.settings.pitch);
        $('#system_tts_rate_output').text(this.settings.rate);
@ -155,7 +155,7 @@ class SystemTtsProvider {
    }

    // Perform a simple readiness check by trying to fetch voiceIds
-    async checkReady(){
+    async checkReady() {
        await this.fetchTtsVoiceObjects()
    }

@ -171,10 +171,16 @@ class SystemTtsProvider {
            return [];
        }

-        return speechSynthesis
-            .getVoices()
-            .sort((a, b) => a.lang.localeCompare(b.lang) || a.name.localeCompare(b.name))
-            .map(x => ({ name: x.name, voice_id: x.voiceURI, preview_url: false, lang: x.lang }));
+        return new Promise((resolve) => {
+            setTimeout(() => {
+                const voices = speechSynthesis
+                    .getVoices()
+                    .sort((a, b) => a.lang.localeCompare(b.lang) || a.name.localeCompare(b.name))
+                    .map(x => ({ name: x.name, voice_id: x.voiceURI, preview_url: false, lang: x.lang }));
+
+                resolve(voices);
+            }, 1);
+        });
    }

    previewTtsVoice(voiceId) {
--- a/public/scripts/extensions/tts/xtts.js
+++ b/public/scripts/extensions/tts/xtts.js
@ -0,0 +1,207 @@
+import { doExtrasFetch, getApiUrl, modules } from "../../extensions.js"
+import { saveTtsProviderSettings } from "./index.js"
+
+export { XTTSTtsProvider }
+
+/**
+ * TTS provider for an XTTSv2 API server (https://github.com/daswer123/xtts-api-server).
+ * Talks to a configurable endpoint, or to the Extras API once it reports TTS support.
+ */
+class XTTSTtsProvider {
+    //########//
+    // Config //
+    //########//
+
+    // Provider state; assigned in loadSettings()
+    settings
+    ready = false
+    // Cache of voice objects, filled lazily by getVoice()
+    voices = []
+    // Joiner placed between extracted quoted parts by the TTS extension
+    separator = '. '
+
+    /**
+     * Perform any text processing before passing to TTS engine.
+     * @param {string} text Input text
+     * @returns {string} Processed text
+     */
+    processText(text) {
+        // Replace fancy ellipsis with "..."
+        text = text.replace(/…/g, '...');
+        // Remove quotes
+        text = text.replace(/["“”‘’]/g, '');
+        // Replace multiple "." with single "."
+        text = text.replace(/\.+/g, '.');
+        return text;
+    }
+
+    // Display name -> language code used as the option value and sent with each request
+    languageLabels = {
+        "Arabic": "ar",
+        "Brazilian Portuguese": "pt",
+        "Chinese": "zh-cn",
+        "Czech": "cs",
+        "Dutch": "nl",
+        "English": "en",
+        "French": "fr",
+        "German": "de",
+        "Italian": "it",
+        "Polish": "pl",
+        "Russian": "ru",
+        "Spanish": "es",
+        "Turkish": "tr",
+        "Japanese": "ja",
+        "Korean": "ko",
+        "Hungarian": "hu",
+        "Hindi": "hi",
+    }
+
+    // Only keys listed here are accepted by loadSettings()
+    defaultSettings = {
+        provider_endpoint: "http://localhost:8020",
+        language: "en",
+        voiceMap: {}
+    }
+
+    /**
+     * Markup for the provider settings panel (language selector and endpoint input).
+     * @returns {string} HTML string
+     */
+    get settingsHtml() {
+        let html = `
+        <label for="xtts_api_language">Language</label>
+        <select id="xtts_api_language">`;
+
+
+        for (let language in this.languageLabels) {
+
+            // Pre-select the currently configured language, if settings are already loaded
+            if (this.languageLabels[language] == this.settings?.language) {
+                html += `<option value="${this.languageLabels[language]}" selected="selected">${language}</option>`;
+                continue
+            }
+
+            html += `<option value="${this.languageLabels[language]}">${language}</option>`;
+        }
+
+
+        html += `
+        </select>
+        <label for="xtts_tts_endpoint">Provider Endpoint:</label>
+        <input id="xtts_tts_endpoint" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.provider_endpoint}"/>
+
+        `;
+
+        html += `
+
+        <span>Use <a target="_blank" href="https://github.com/daswer123/xtts-api-server">XTTSv2 TTS Server</a>.</span>
+        `;
+
+        return html;
+    }
+
+    /**
+     * Read endpoint and language back from the UI and persist the settings.
+     */
+    onSettingsChange() {
+        // Used when provider settings are updated from UI
+        this.settings.provider_endpoint = $('#xtts_tts_endpoint').val()
+        this.settings.language = $('#xtts_api_language').val()
+        saveTtsProviderSettings()
+    }
+
+    /**
+     * Apply saved settings, sync the settings UI and attach the change handlers.
+     * @param {object} settings - Saved provider settings; may be empty.
+     * @returns {Promise<void>}
+     * @throws {string} When `settings` contains a key that is not in `defaultSettings`.
+     */
+    async loadSettings(settings) {
+        // Populate Provider UI given input settings
+        if (Object.keys(settings).length == 0) {
+            console.info("Using default TTS Provider settings")
+        }
+
+        // Only accept keys defined in defaultSettings
+        this.settings = this.defaultSettings
+
+        for (const key in settings) {
+            if (key in this.settings) {
+                this.settings[key] = settings[key]
+            } else {
+                throw `Invalid setting passed to TTS Provider: ${key}`
+            }
+        }
+
+        // Poll every 2 seconds until the Extras modules list reports TTS support, then stop
+        const apiCheckInterval = setInterval(() => {
+            // Use Extras API if TTS support is enabled
+            if (modules.includes('tts') || modules.includes('xtts-tts')) {
+                const baseUrl = new URL(getApiUrl());
+                baseUrl.pathname = '/api/tts';
+                this.settings.provider_endpoint = baseUrl.toString();
+                $('#xtts_tts_endpoint').val(this.settings.provider_endpoint);
+                clearInterval(apiCheckInterval);
+            }
+        }, 2000);
+
+        $('#xtts_tts_endpoint').val(this.settings.provider_endpoint)
+        $('#xtts_tts_endpoint').on("input", () => { this.onSettingsChange() })
+        $('#xtts_api_language').val(this.settings.language)
+        $('#xtts_api_language').on("change", () => { this.onSettingsChange() })
+
+        await this.checkReady()
+
+        console.debug("XTTS: Settings loaded")
+    }
+
+    // Perform a simple readiness check by trying to fetch voiceIds
+    async checkReady() {
+        await this.fetchTtsVoiceObjects()
+    }
+
+    /** Refresh handler; there is nothing to refresh for this provider. */
+    async onRefreshClick() {
+        return
+    }
+
+    //#################//
+    //  TTS Interfaces //
+    //#################//
+
+    /**
+     * Look up a voice object by name, fetching the voice list first if it is not cached yet.
+     * @param {string} voiceName - Name of the voice to find.
+     * @returns {Promise<object>} The matching voice object.
+     * @throws {string} When no voice with that name exists.
+     */
+    async getVoice(voiceName) {
+        if (this.voices.length == 0) {
+            this.voices = await this.fetchTtsVoiceObjects()
+        }
+        const match = this.voices.find(
+            XTTSVoice => XTTSVoice.name == voiceName
+        )
+        if (!match) {
+            throw `TTS Voice name ${voiceName} not found`
+        }
+        return match
+    }
+
+    /**
+     * Generate speech for the given text.
+     * @param {string} text - Text to synthesize.
+     * @param {string} voiceId - Voice identifier, sent as `speaker_wav`.
+     * @returns {Promise<Response>} The fetch response of the generation request.
+     */
+    async generateTts(text, voiceId) {
+        const response = await this.fetchTtsGeneration(text, voiceId)
+        return response
+    }
+
+    //###########//
+    // API CALLS //
+    //###########//
+
+    /**
+     * Fetch the speaker list from `<provider_endpoint>/speakers`.
+     * @returns {Promise<any>} The parsed JSON body of the response.
+     * @throws {Error} When the response status is not OK.
+     */
+    async fetchTtsVoiceObjects() {
+        const response = await doExtrasFetch(`${this.settings.provider_endpoint}/speakers`)
+        if (!response.ok) {
+            // Read the body as text: an error body may not be JSON, and an object would print as "[object Object]"
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`)
+        }
+        const responseJson = await response.json()
+        return responseJson
+    }
+
+    /**
+     * POST a generation request to `<provider_endpoint>/tts_to_audio/` with the configured language.
+     * @param {string} inputText - Text to synthesize.
+     * @param {string} voiceId - Voice identifier, sent as `speaker_wav`.
+     * @returns {Promise<Response>} The successful fetch response.
+     * @throws {Error} When the response status is not OK; a toast is shown as well.
+     */
+    async fetchTtsGeneration(inputText, voiceId) {
+        console.info(`Generating new TTS for voice_id ${voiceId}`)
+        const response = await doExtrasFetch(
+            `${this.settings.provider_endpoint}/tts_to_audio/`,
+            {
+                method: 'POST',
+                headers: {
+                    'Content-Type': 'application/json',
+                    'Cache-Control': 'no-cache'  // Added this line to disable caching of file so new files are always played - Rolyat 7/7/23
+                },
+                body: JSON.stringify({
+                    "text": inputText,
+                    "speaker_wav": voiceId,
+                    "language": this.settings.language
+                })
+            }
+        )
+        if (!response.ok) {
+            toastr.error(response.statusText, 'TTS Generation Failed');
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+        }
+        return response
+    }
+
+    // Interface not used by XTTS TTS
+    async fetchTtsFromHistory(history_item_id) {
+        return Promise.resolve(history_item_id);
+    }
+
+}