Merge branch 'staging' into parser-followup-2

This commit is contained in:
LenAnderson
2024-07-04 11:37:35 -04:00
54 changed files with 2457 additions and 1285 deletions

View File

@@ -1,6 +1,6 @@
import { ensureImageFormatSupported, getBase64Async, isTrueBoolean, saveBase64AsFile } from '../../utils.js';
import { getContext, getApiUrl, doExtrasFetch, extension_settings, modules, renderExtensionTemplateAsync } from '../../extensions.js';
import { callPopup, getRequestHeaders, saveSettingsDebounced, substituteParamsExtended } from '../../../script.js';
import { appendMediaToMessage, callPopup, eventSource, event_types, getRequestHeaders, saveChatConditional, saveSettingsDebounced, substituteParamsExtended } from '../../../script.js';
import { getMessageTimeStamp } from '../../RossAscends-mods.js';
import { SECRET_KEYS, secret_state } from '../../secrets.js';
import { getMultimodalCaption } from '../shared.js';
@@ -84,12 +84,11 @@ async function setSpinnerIcon() {
}
/**
* Sends a captioned message to the chat.
* @param {string} caption Caption text
* @param {string} image Image URL
* Wraps a caption with a message template.
* @param {string} caption Raw caption
* @returns {Promise<string>} Wrapped caption
*/
async function sendCaptionedMessage(caption, image) {
const context = getContext();
async function wrapCaptionTemplate(caption) {
let template = extension_settings.caption.template || TEMPLATE_DEFAULT;
if (!/{{caption}}/i.test(template)) {
@@ -101,7 +100,7 @@ async function sendCaptionedMessage(caption, image) {
if (extension_settings.caption.refine_mode) {
messageText = await callPopup(
'<h3>Review and edit the generated message:</h3>Press "Cancel" to abort the caption sending.',
'<h3>Review and edit the generated caption:</h3>Press "Cancel" to abort the caption sending.',
'input',
messageText,
{ rows: 5, okButton: 'Send' });
@@ -111,6 +110,55 @@ async function sendCaptionedMessage(caption, image) {
}
}
return messageText;
}
/**
* Appends caption to an existing message.
* @param {Object} data Message data
* @returns {Promise<void>}
*/
async function captionExistingMessage(data) {
if (!(data?.extra?.image)) {
return;
}
const imageData = await fetch(data.extra.image);
const blob = await imageData.blob();
const type = imageData.headers.get('Content-Type');
const file = new File([blob], 'image.png', { type });
const caption = await getCaptionForFile(file, null, true);
if (!caption) {
console.warn('Failed to generate a caption for the image.');
return;
}
const wrappedCaption = await wrapCaptionTemplate(caption);
const messageText = String(data.mes).trim();
if (!messageText) {
data.extra.inline_image = false;
data.mes = wrappedCaption;
data.extra.title = wrappedCaption;
}
else {
data.extra.inline_image = true;
data.extra.append_title = true;
data.extra.title = wrappedCaption;
}
}
/**
* Sends a captioned message to the chat.
* @param {string} caption Caption text
* @param {string} image Image URL
*/
async function sendCaptionedMessage(caption, image) {
const messageText = await wrapCaptionTemplate(caption);
const context = getContext();
const message = {
name: context.name1,
is_user: true,
@@ -356,6 +404,7 @@ jQuery(async function () {
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'llamacpp' && textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ooba' && textgenerationwebui_settings.server_urls[textgen_types.OOBA]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'koboldcpp' && textgenerationwebui_settings.server_urls[textgen_types.KOBOLDCPP]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'vllm' && textgenerationwebui_settings.server_urls[textgen_types.VLLM]) ||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'custom') ||
extension_settings.caption.source === 'local' ||
extension_settings.caption.source === 'horde';
@@ -408,7 +457,7 @@ jQuery(async function () {
});
}
async function addSettings() {
const html = await renderExtensionTemplateAsync('caption', 'settings');
const html = await renderExtensionTemplateAsync('caption', 'settings', { TEMPLATE_DEFAULT, PROMPT_DEFAULT });
$('#caption_container').append(html);
}
@@ -422,6 +471,7 @@ jQuery(async function () {
$('#caption_refine_mode').prop('checked', !!(extension_settings.caption.refine_mode));
$('#caption_allow_reverse_proxy').prop('checked', !!(extension_settings.caption.allow_reverse_proxy));
$('#caption_prompt_ask').prop('checked', !!(extension_settings.caption.prompt_ask));
$('#caption_auto_mode').prop('checked', !!(extension_settings.caption.auto_mode));
$('#caption_source').val(extension_settings.caption.source);
$('#caption_prompt').val(extension_settings.caption.prompt);
$('#caption_template').val(extension_settings.caption.template);
@@ -447,6 +497,41 @@ jQuery(async function () {
extension_settings.caption.prompt_ask = $('#caption_prompt_ask').prop('checked');
saveSettingsDebounced();
});
$('#caption_auto_mode').on('input', () => {
extension_settings.caption.auto_mode = !!$('#caption_auto_mode').prop('checked');
saveSettingsDebounced();
});
const onMessageEvent = async (index) => {
if (!extension_settings.caption.auto_mode) {
return;
}
const data = getContext().chat[index];
await captionExistingMessage(data);
};
eventSource.on(event_types.MESSAGE_SENT, onMessageEvent);
eventSource.on(event_types.MESSAGE_FILE_EMBEDDED, onMessageEvent);
$(document).on('click', '.mes_img_caption', async function () {
const animationClass = 'fa-fade';
const messageBlock = $(this).closest('.mes');
const messageImg = messageBlock.find('.mes_img');
if (messageImg.hasClass(animationClass)) return;
messageImg.addClass(animationClass);
try {
const index = Number(messageBlock.attr('mesid'));
const data = getContext().chat[index];
await captionExistingMessage(data);
appendMediaToMessage(data, messageBlock, false);
await saveChatConditional();
} catch(e) {
console.error('Message image recaption failed', e);
} finally {
messageImg.removeClass(animationClass);
}
});
SlashCommandParser.addCommandObject(SlashCommand.fromProps({ name: 'caption',
callback: captionCommandCallback,
@@ -482,4 +567,6 @@ jQuery(async function () {
</div>
`,
}));
document.body.classList.add('caption');
});

View File

@@ -26,6 +26,7 @@
<option value="openai">OpenAI</option>
<option value="openrouter">OpenRouter</option>
<option value="ooba" data-i18n="Text Generation WebUI (oobabooga)">Text Generation WebUI (oobabooga)</option>
<option value="vllm">vLLM</option>
</select>
</div>
<div class="flex1 flex-container flexFlowColumn flexNoGap">
@@ -66,6 +67,7 @@
<option data-type="llamacpp" value="llamacpp_current" data-i18n="currently_loaded">[Currently loaded]</option>
<option data-type="ooba" value="ooba_current" data-i18n="currently_loaded">[Currently loaded]</option>
<option data-type="koboldcpp" value="koboldcpp_current" data-i18n="currently_loaded">[Currently loaded]</option>
<option data-type="vllm" value="vllm_current" data-i18n="currently_selected">[Currently selected]</option>
<option data-type="custom" value="custom_current" data-i18n="currently_selected">[Currently selected]</option>
</select>
</div>
@@ -82,14 +84,19 @@
</div>
<div id="caption_prompt_block">
<label for="caption_prompt" data-i18n="Caption Prompt">Caption Prompt</label>
<textarea id="caption_prompt" class="text_pole" rows="1" placeholder="&lt; Use default &gt;">${PROMPT_DEFAULT}</textarea>
<textarea id="caption_prompt" class="text_pole" rows="1" placeholder="&lt; Use default &gt;">{{PROMPT_DEFAULT}}</textarea>
<label class="checkbox_label margin-bot-10px" for="caption_prompt_ask" title="Ask for a custom prompt every time an image is captioned.">
<input id="caption_prompt_ask" type="checkbox" class="checkbox">
<span data-i18n="Ask every time">Ask every time</span>
</label>
</div>
<label for="caption_template"><span data-i18n="Message Template">Message Template</span> <small><span data-i18n="(use _space">(use </span> <code>&lcub;&lcub;caption&rcub;&rcub;</code> <span data-i18n="macro)">macro)</span></small></label>
<textarea id="caption_template" class="text_pole" rows="2" placeholder="&lt; Use default &gt;">${TEMPLATE_DEFAULT}</textarea>
<textarea id="caption_template" class="text_pole" rows="2" placeholder="&lt; Use default &gt;">{{TEMPLATE_DEFAULT}}</textarea>
<label class="checkbox_label" for="caption_auto_mode">
<input id="caption_auto_mode" type="checkbox" class="checkbox">
<span data-i18n="Automatically caption images">Automatically caption images</span>
<i class="fa-solid fa-info-circle" title="Automatically caption images when they are pasted into the chat or attached to messages."></i>
</label>
<label class="checkbox_label margin-bot-10px" for="caption_refine_mode">
<input id="caption_refine_mode" type="checkbox" class="checkbox">
<span data-i18n="Edit captions before saving">Edit captions before saving</span>

View File

@@ -34,6 +34,7 @@ export async function getMultimodalCaption(base64Img, prompt) {
const isCustom = extension_settings.caption.multimodal_api === 'custom';
const isOoba = extension_settings.caption.multimodal_api === 'ooba';
const isKoboldCpp = extension_settings.caption.multimodal_api === 'koboldcpp';
const isVllm = extension_settings.caption.multimodal_api === 'vllm';
const base64Bytes = base64Img.length * 0.75;
const compressionLimit = 2 * 1024 * 1024;
if ((['google', 'openrouter'].includes(extension_settings.caption.multimodal_api) && base64Bytes > compressionLimit) || isOoba || isKoboldCpp) {
@@ -65,6 +66,14 @@ export async function getMultimodalCaption(base64Img, prompt) {
requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.OLLAMA];
}
if (isVllm) {
if (extension_settings.caption.multimodal_model === 'vllm_current') {
requestBody.model = textgenerationwebui_settings.vllm_model;
}
requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.VLLM];
}
if (isLlamaCpp) {
requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP];
}
@@ -151,6 +160,14 @@ function throwIfInvalidModel(useReverseProxy) {
throw new Error('KoboldCpp server URL is not set.');
}
if (extension_settings.caption.multimodal_api === 'vllm' && !textgenerationwebui_settings.server_urls[textgen_types.VLLM]) {
throw new Error('vLLM server URL is not set.');
}
if (extension_settings.caption.multimodal_api === 'vllm' && extension_settings.caption.multimodal_model === 'vllm_current' && !textgenerationwebui_settings.vllm_model) {
throw new Error('vLLM model is not set.');
}
if (extension_settings.caption.multimodal_api === 'custom' && !oai_settings.custom_url) {
throw new Error('Custom API URL is not set.');
}

View File

@@ -3,7 +3,6 @@ import {
systemUserName,
hideSwipeButtons,
showSwipeButtons,
callPopup,
getRequestHeaders,
event_types,
eventSource,
@@ -29,10 +28,9 @@ import { getMultimodalCaption } from '../shared.js';
import { SlashCommandParser } from '../../slash-commands/SlashCommandParser.js';
import { SlashCommand } from '../../slash-commands/SlashCommand.js';
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from '../../slash-commands/SlashCommandArgument.js';
import { resolveVariable } from '../../variables.js';
import { debounce_timeout } from '../../constants.js';
import { commonEnumProviders } from '../../slash-commands/SlashCommandCommonEnumsProvider.js';
import { SlashCommandEnumValue } from '../../slash-commands/SlashCommandEnumValue.js';
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
export { MODULE_NAME };
const MODULE_NAME = 'sd';
@@ -572,7 +570,7 @@ async function onDeleteStyleClick() {
return;
}
const confirmed = await callPopup(`Are you sure you want to delete the style "${selectedStyle}"?`, 'confirm', '', { okButton: 'Delete' });
const confirmed = await callGenericPopup(`Are you sure you want to delete the style "${selectedStyle}"?`, POPUP_TYPE.CONFIRM, '', { okButton: 'Delete', cancelButton: 'Cancel' });
if (!confirmed) {
return;
@@ -601,7 +599,7 @@ async function onDeleteStyleClick() {
}
async function onSaveStyleClick() {
const userInput = await callPopup('Enter style name:', 'input', '', { okButton: 'Save' });
const userInput = await callGenericPopup('Enter style name:', POPUP_TYPE.INPUT);
if (!userInput) {
return;
@@ -670,7 +668,7 @@ async function refinePrompt(prompt, allowExpand, isNegative = false) {
if (extension_settings.sd.refine_mode) {
const text = isNegative ? '<h3>Review and edit the <i>negative</i> prompt:</h3>' : '<h3>Review and edit the prompt:</h3>';
const refinedPrompt = await callPopup(text + 'Press "Cancel" to abort the image generation.', 'input', prompt.trim(), { rows: 5, okButton: 'Continue' });
const refinedPrompt = await callGenericPopup(text + 'Press "Cancel" to abort the image generation.', POPUP_TYPE.INPUT, prompt.trim(), { rows: 5, okButton: 'Continue' });
if (refinedPrompt) {
return refinedPrompt;
@@ -2918,25 +2916,25 @@ async function generateComfyImage(prompt, negativePrompt) {
const text = await workflowResponse.text();
toastr.error(`Failed to load workflow.\n\n${text}`);
}
let workflow = (await workflowResponse.json()).replace('"%prompt%"', JSON.stringify(prompt));
workflow = workflow.replace('"%negative_prompt%"', JSON.stringify(negativePrompt));
let workflow = (await workflowResponse.json()).replaceAll('"%prompt%"', JSON.stringify(prompt));
workflow = workflow.replaceAll('"%negative_prompt%"', JSON.stringify(negativePrompt));
const seed = extension_settings.sd.seed >= 0 ? extension_settings.sd.seed : Math.round(Math.random() * Number.MAX_SAFE_INTEGER);
workflow = workflow.replaceAll('"%seed%"', JSON.stringify(seed));
placeholders.forEach(ph => {
workflow = workflow.replace(`"%${ph}%"`, JSON.stringify(extension_settings.sd[ph]));
workflow = workflow.replaceAll(`"%${ph}%"`, JSON.stringify(extension_settings.sd[ph]));
});
(extension_settings.sd.comfy_placeholders ?? []).forEach(ph => {
workflow = workflow.replace(`"%${ph.find}%"`, JSON.stringify(substituteParams(ph.replace)));
workflow = workflow.replaceAll(`"%${ph.find}%"`, JSON.stringify(substituteParams(ph.replace)));
});
if (/%user_avatar%/gi.test(workflow)) {
const response = await fetch(getUserAvatarUrl());
if (response.ok) {
const avatarBlob = await response.blob();
const avatarBase64 = await getBase64Async(avatarBlob);
workflow = workflow.replace('"%user_avatar%"', JSON.stringify(avatarBase64));
workflow = workflow.replaceAll('"%user_avatar%"', JSON.stringify(avatarBase64));
} else {
workflow = workflow.replace('"%user_avatar%"', JSON.stringify(PNG_PIXEL));
workflow = workflow.replaceAll('"%user_avatar%"', JSON.stringify(PNG_PIXEL));
}
}
if (/%char_avatar%/gi.test(workflow)) {
@@ -2944,9 +2942,9 @@ async function generateComfyImage(prompt, negativePrompt) {
if (response.ok) {
const avatarBlob = await response.blob();
const avatarBase64 = await getBase64Async(avatarBlob);
workflow = workflow.replace('"%char_avatar%"', JSON.stringify(avatarBase64));
workflow = workflow.replaceAll('"%char_avatar%"', JSON.stringify(avatarBase64));
} else {
workflow = workflow.replace('"%char_avatar%"', JSON.stringify(PNG_PIXEL));
workflow = workflow.replaceAll('"%char_avatar%"', JSON.stringify(PNG_PIXEL));
}
}
console.log(`{
@@ -2978,7 +2976,7 @@ async function onComfyOpenWorkflowEditorClick() {
}),
})).json();
const editorHtml = $(await $.get('scripts/extensions/stable-diffusion/comfyWorkflowEditor.html'));
const popupResult = callPopup(editorHtml, 'confirm', undefined, { okButton: 'Save', wide: true, large: true, rows: 1 });
const popupResult = callGenericPopup(editorHtml, POPUP_TYPE.CONFIRM, '', { okButton: 'Save', cancelButton: 'Cancel', wide: true, large: true });
const checkPlaceholders = () => {
workflow = $('#sd_comfy_workflow_editor_workflow').val().toString();
$('.sd_comfy_workflow_editor_placeholder_list > li[data-placeholder]').each(function (idx) {
@@ -3058,7 +3056,7 @@ async function onComfyOpenWorkflowEditorClick() {
}
async function onComfyNewWorkflowClick() {
let name = await callPopup('<h3>Workflow name:</h3>', 'input');
let name = await callGenericPopup('Workflow name:', POPUP_TYPE.INPUT);
if (!name) {
return;
}
@@ -3085,7 +3083,7 @@ async function onComfyNewWorkflowClick() {
}
async function onComfyDeleteWorkflowClick() {
const confirm = await callPopup('Delete the workflow? This action is irreversible.', 'confirm');
const confirm = await callGenericPopup('Delete the workflow? This action is irreversible.', POPUP_TYPE.CONFIRM, '', { okButton: 'Delete', cancelButton: 'Cancel' });
if (!confirm) {
return;
}

View File

@@ -1,4 +1,5 @@
import { callPopup, getRequestHeaders } from '../../../script.js';
import { getRequestHeaders } from '../../../script.js';
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
import { SECRET_KEYS, findSecret, secret_state, writeSecret } from '../../secrets.js';
import { getPreviewString, saveTtsProviderSettings } from './index.js';
export { AzureTtsProvider };
@@ -69,13 +70,13 @@ class AzureTtsProvider {
const popupText = 'Azure TTS API Key';
const savedKey = secret_state[SECRET_KEYS.AZURE_TTS] ? await findSecret(SECRET_KEYS.AZURE_TTS) : '';
const key = await callPopup(popupText, 'input', savedKey);
const key = await callGenericPopup(popupText, POPUP_TYPE.INPUT, savedKey);
if (key == false || key == '') {
return;
}
await writeSecret(SECRET_KEYS.AZURE_TTS, key);
await writeSecret(SECRET_KEYS.AZURE_TTS, String(key));
toastr.success('API Key saved');
$('#azure_tts_key').addClass('success');

View File

@@ -5,8 +5,8 @@ TODO:
*/
import { doExtrasFetch, extension_settings, getApiUrl, modules } from '../../extensions.js';
import { callPopup } from '../../../script.js';
import { initVoiceMap } from './index.js';
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
export { CoquiTtsProvider };
@@ -246,7 +246,7 @@ class CoquiTtsProvider {
}
// Ask user for voiceId name to save voice
const voiceName = await callPopup('<h3>Name of Coqui voice to add to voice select dropdown:</h3>', 'input');
const voiceName = await callGenericPopup('Name of Coqui voice to add to voice select dropdown:', POPUP_TYPE.INPUT);
const model_origin = $('#coqui_model_origin').val();
const model_language = $('#coqui_api_language').val();

View File

@@ -1,4 +1,4 @@
import { cancelTtsPlay, eventSource, event_types, isStreamingEnabled, name2, saveSettingsDebounced, substituteParams } from '../../../script.js';
import { cancelTtsPlay, eventSource, event_types, getCurrentChatId, isStreamingEnabled, name2, saveSettingsDebounced, substituteParams } from '../../../script.js';
import { ModuleWorkerWrapper, doExtrasFetch, extension_settings, getApiUrl, getContext, modules, renderExtensionTemplateAsync } from '../../extensions.js';
import { delay, escapeRegex, getBase64Async, getStringHash, onlyUnique } from '../../utils.js';
import { EdgeTtsProvider } from './edge.js';
@@ -10,6 +10,7 @@ import { NovelTtsProvider } from './novel.js';
import { power_user } from '../../power-user.js';
import { OpenAITtsProvider } from './openai.js';
import { XTTSTtsProvider } from './xtts.js';
import { VITSTtsProvider } from './vits.js';
import { GSVITtsProvider } from './gsvi.js';
import { SBVits2TtsProvider } from './sbvits2.js';
import { AllTalkTtsProvider } from './alltalk.js';
@@ -34,6 +35,7 @@ let lastMessage = null;
let lastMessageHash = null;
let periodicMessageGenerationTimer = null;
let lastPositionOfParagraphEnd = -1;
let currentInitVoiceMapPromise = null;
const DEFAULT_VOICE_MARKER = '[Default Voice]';
const DISABLED_VOICE_MARKER = 'disabled';
@@ -83,6 +85,7 @@ const ttsProviders = {
ElevenLabs: ElevenLabsTtsProvider,
Silero: SileroTtsProvider,
XTTSv2: XTTSTtsProvider,
VITS: VITSTtsProvider,
GSVI: GSVITtsProvider,
SBVits2: SBVits2TtsProvider,
System: SystemTtsProvider,
@@ -1008,9 +1011,39 @@ class VoiceMapEntry {
/**
* Init voiceMapEntries for character select list.
* If an initialization is already in progress, it returns the existing Promise instead of starting a new one.
* @param {boolean} unrestricted - If true, will include all characters in voiceMapEntries, even if they are not in the current chat.
* @returns {Promise} A promise that resolves when the initialization is complete.
*/
export async function initVoiceMap(unrestricted = false) {
// Preventing parallel execution
if (currentInitVoiceMapPromise) {
return currentInitVoiceMapPromise;
}
currentInitVoiceMapPromise = (async () => {
const initialChatId = getCurrentChatId();
try {
await initVoiceMapInternal(unrestricted);
} finally {
currentInitVoiceMapPromise = null;
}
const currentChatId = getCurrentChatId();
if (initialChatId !== currentChatId) {
// Chat changed during initialization, reinitialize
await initVoiceMap(unrestricted);
}
})();
return currentInitVoiceMapPromise;
}
/**
* Init voiceMapEntries for character select list.
* @param {boolean} unrestricted - If true, will include all characters in voiceMapEntries, even if they are not in the current chat.
*/
async function initVoiceMapInternal(unrestricted) {
// Gate initialization if not enabled or TTS Provider not ready. Prevents error popups.
const enabled = $('#tts_enabled').is(':checked');
if (!enabled) {

View File

@@ -1,4 +1,5 @@
import { getRequestHeaders, callPopup } from '../../../script.js';
import { getRequestHeaders } from '../../../script.js';
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
import { splitRecursive } from '../../utils.js';
import { getPreviewString, saveTtsProviderSettings } from './index.js';
import { initVoiceMap } from './index.js';
@@ -56,7 +57,7 @@ class NovelTtsProvider {
// Add a new Novel custom voice to provider
async addCustomVoice() {
const voiceName = await callPopup('<h3>Custom Voice name:</h3>', 'input');
const voiceName = await callGenericPopup('Custom Voice name:', POPUP_TYPE.INPUT);
this.settings.customVoices.push(voiceName);
this.populateCustomVoices();
initVoiceMap(); // Update TTS extension voiceMap

View File

@@ -0,0 +1,404 @@
import { getPreviewString, saveTtsProviderSettings } from './index.js';
export { VITSTtsProvider };
class VITSTtsProvider {
//########//
// Config //
//########//
settings;
ready = false;
voices = [];
separator = '. ';
audioElement = document.createElement('audio');
/**
* Perform any text processing before passing to TTS engine.
* @param {string} text Input text
* @returns {string} Processed text
*/
processText(text) {
return text;
}
audioFormats = ['wav', 'ogg', 'silk', 'mp3', 'flac'];
languageLabels = {
'Auto': 'auto',
'Chinese': 'zh',
'English': 'en',
'Japanese': 'ja',
'Korean': 'ko',
};
langKey2LangCode = {
'zh': 'zh-CN',
'en': 'en-US',
'ja': 'ja-JP',
'ko': 'ko-KR',
};
modelTypes = {
VITS: 'VITS',
W2V2_VITS: 'W2V2-VITS',
BERT_VITS2: 'BERT-VITS2',
};
defaultSettings = {
provider_endpoint: 'http://localhost:23456',
format: 'wav',
lang: 'auto',
length: 1.0,
noise: 0.33,
noisew: 0.4,
segment_size: 50,
streaming: false,
dim_emotion: 0,
sdp_ratio: 0.2,
emotion: 0,
text_prompt: '',
style_text: '',
style_weight: 1,
};
get settingsHtml() {
let html = `
<label for="vits_lang">Text Language</label>
<select id="vits_lang">`;
for (let language in this.languageLabels) {
if (this.languageLabels[language] == this.settings?.lang) {
html += `<option value="${this.languageLabels[language]}" selected="selected">${language}</option>`;
continue;
}
html += `<option value="${this.languageLabels[language]}">${language}</option>`;
}
html += `
</select>
<label>VITS / W2V2-VITS / Bert-VITS2 Settings:</label><br/>
<label for="vits_endpoint">Provider Endpoint:</label>
<input id="vits_endpoint" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.provider_endpoint}"/>
<span>Use <a target="_blank" href="https://github.com/Artrajz/vits-simple-api">vits-simple-api</a>.</span><br/>
<label for="vits_format">Audio format:</label>
<select id="vits_format">`;
for (let format of this.audioFormats) {
if (format == this.settings?.format) {
html += `<option value="${format}" selected="selected">${format}</option>`;
continue;
}
html += `<option value="${format}">${format}</option>`;
}
html += `
</select>
<label for="vits_length">Audio length: <span id="vits_length_output">${this.defaultSettings.length}</span></label>
<input id="vits_length" type="range" value="${this.defaultSettings.length}" min="0.0" max="5" step="0.01" />
<label for="vits_noise">Noise: <span id="vits_noise_output">${this.defaultSettings.noise}</span></label>
<input id="vits_noise" type="range" value="${this.defaultSettings.noise}" min="0.1" max="2" step="0.01" />
<label for="vits_noisew">SDP noise: <span id="vits_noisew_output">${this.defaultSettings.noisew}</span></label>
<input id="vits_noisew" type="range" value="${this.defaultSettings.noisew}" min="0.1" max="2" step="0.01" />
<label for="vits_segment_size">Segment Size: <span id="vits_segment_size_output">${this.defaultSettings.segment_size}</span></label>
<input id="vits_segment_size" type="range" value="${this.defaultSettings.segment_size}" min="0" max="1000" step="1" />
<label for="vits_streaming" class="checkbox_label">
<input id="vits_streaming" type="checkbox" />
<span>Streaming</span>
</label>
<label>W2V2-VITS Settings:</label><br/>
<label for="vits_dim_emotion">Dimensional emotion:</label>
<input id="vits_dim_emotion" type="number" class="text_pole" min="0" max="5457" step="1" value="${this.defaultSettings.dim_emotion}"/>
<label>BERT-VITS2 Settings:</label><br/>
<label for="vits_sdp_ratio">sdp_ratio: <span id="vits_sdp_ratio_output">${this.defaultSettings.sdp_ratio}</span></label>
<input id="vits_sdp_ratio" type="range" value="${this.defaultSettings.sdp_ratio}" min="0.0" max="1" step="0.01" />
<label for="vits_emotion">emotion: <span id="vits_emotion_output">${this.defaultSettings.emotion}</span></label>
<input id="vits_emotion" type="range" value="${this.defaultSettings.emotion}" min="0" max="9" step="1" />
<label for="vits_text_prompt">Text Prompt:</label>
<input id="vits_text_prompt" type="text" class="text_pole" maxlength="512" value="${this.defaultSettings.text_prompt}"/>
<label for="vits_style_text">Style text:</label>
<input id="vits_style_text" type="text" class="text_pole" maxlength="512" value="${this.defaultSettings.style_text}"/>
<label for="vits_style_weight">Style weight <span id="vits_style_weight_output">${this.defaultSettings.style_weight}</span></label>
<input id="vits_style_weight" type="range" value="${this.defaultSettings.style_weight}" min="0" max="1" step="0.01" />
`;
return html;
}
onSettingsChange() {
// Used when provider settings are updated from UI
this.settings.provider_endpoint = $('#vits_endpoint').val();
this.settings.lang = $('#vits_lang').val();
this.settings.format = $('#vits_format').val();
this.settings.dim_emotion = $('#vits_dim_emotion').val();
this.settings.text_prompt = $('#vits_text_prompt').val();
this.settings.style_text = $('#vits_style_text').val();
// Update the default TTS settings based on input fields
this.settings.length = $('#vits_length').val();
this.settings.noise = $('#vits_noise').val();
this.settings.noisew = $('#vits_noisew').val();
this.settings.segment_size = $('#vits_segment_size').val();
this.settings.streaming = $('#vits_streaming').is(':checked');
this.settings.sdp_ratio = $('#vits_sdp_ratio').val();
this.settings.emotion = $('#vits_emotion').val();
this.settings.style_weight = $('#vits_style_weight').val();
// Update the UI to reflect changes
$('#vits_length_output').text(this.settings.length);
$('#vits_noise_output').text(this.settings.noise);
$('#vits_noisew_output').text(this.settings.noisew);
$('#vits_segment_size_output').text(this.settings.segment_size);
$('#vits_sdp_ratio_output').text(this.settings.sdp_ratio);
$('#vits_emotion_output').text(this.settings.emotion);
$('#vits_style_weight_output').text(this.settings.style_weight);
saveTtsProviderSettings();
this.changeTTSSettings();
}
async loadSettings(settings) {
// Pupulate Provider UI given input settings
if (Object.keys(settings).length == 0) {
console.info('Using default TTS Provider settings');
}
// Only accept keys defined in defaultSettings
this.settings = this.defaultSettings;
for (const key in settings) {
if (key in this.settings) {
this.settings[key] = settings[key];
} else {
console.debug(`Ignoring non-user-configurable setting: ${key}`);
}
}
// Set initial values from the settings
$('#vits_endpoint').val(this.settings.provider_endpoint);
$('#vits_lang').val(this.settings.lang);
$('#vits_format').val(this.settings.format);
$('#vits_length').val(this.settings.length);
$('#vits_noise').val(this.settings.noise);
$('#vits_noisew').val(this.settings.noisew);
$('#vits_segment_size').val(this.settings.segment_size);
$('#vits_streaming').prop('checked', this.settings.streaming);
$('#vits_dim_emotion').val(this.settings.dim_emotion);
$('#vits_sdp_ratio').val(this.settings.sdp_ratio);
$('#vits_emotion').val(this.settings.emotion);
$('#vits_text_prompt').val(this.settings.text_prompt);
$('#vits_style_text').val(this.settings.style_text);
$('#vits_style_weight').val(this.settings.style_weight);
// Update the UI to reflect changes
$('#vits_length_output').text(this.settings.length);
$('#vits_noise_output').text(this.settings.noise);
$('#vits_noisew_output').text(this.settings.noisew);
$('#vits_segment_size_output').text(this.settings.segment_size);
$('#vits_sdp_ratio_output').text(this.settings.sdp_ratio);
$('#vits_emotion_output').text(this.settings.emotion);
$('#vits_style_weight_output').text(this.settings.style_weight);
// Register input/change event listeners to update settings on user interaction
$('#vits_endpoint').on('input', () => { this.onSettingsChange(); });
$('#vits_lang').on('change', () => { this.onSettingsChange(); });
$('#vits_format').on('change', () => { this.onSettingsChange(); });
$('#vits_length').on('change', () => { this.onSettingsChange(); });
$('#vits_noise').on('change', () => { this.onSettingsChange(); });
$('#vits_noisew').on('change', () => { this.onSettingsChange(); });
$('#vits_segment_size').on('change', () => { this.onSettingsChange(); });
$('#vits_streaming').on('change', () => { this.onSettingsChange(); });
$('#vits_dim_emotion').on('change', () => { this.onSettingsChange(); });
$('#vits_sdp_ratio').on('change', () => { this.onSettingsChange(); });
$('#vits_emotion').on('change', () => { this.onSettingsChange(); });
$('#vits_text_prompt').on('change', () => { this.onSettingsChange(); });
$('#vits_style_text').on('change', () => { this.onSettingsChange(); });
$('#vits_style_weight').on('change', () => { this.onSettingsChange(); });
await this.checkReady();
console.info('VITS: Settings loaded');
}
// Perform a simple readiness check by trying to fetch voiceIds
async checkReady() {
await Promise.allSettled([this.fetchTtsVoiceObjects(), this.changeTTSSettings()]);
}
async onRefreshClick() {
return;
}
//#################//
// TTS Interfaces //
//#################//
async getVoice(voiceName) {
if (this.voices.length == 0) {
this.voices = await this.fetchTtsVoiceObjects();
}
const match = this.voices.filter(
v => v.name == voiceName,
)[0];
if (!match) {
throw `TTS Voice name ${voiceName} not found`;
}
return match;
}
async getVoiceById(voiceId) {
if (this.voices.length == 0) {
this.voices = await this.fetchTtsVoiceObjects();
}
const match = this.voices.filter(
v => v.voice_id == voiceId,
)[0];
if (!match) {
throw `TTS Voice id ${voiceId} not found`;
}
return match;
}
async generateTts(text, voiceId) {
const response = await this.fetchTtsGeneration(text, voiceId);
return response;
}
//###########//
// API CALLS //
//###########//
async fetchTtsVoiceObjects() {
const response = await fetch(`${this.settings.provider_endpoint}/voice/speakers`);
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${await response.json()}`);
}
const jsonData = await response.json();
const voices = [];
const addVoices = (modelType) => {
jsonData[modelType].forEach(voice => {
voices.push({
name: `[${modelType}] ${voice.name} (${voice.lang})`,
voice_id: `${modelType}&${voice.id}`,
preview_url: false,
lang: voice.lang,
});
});
};
for (const key in this.modelTypes) {
addVoices(this.modelTypes[key]);
}
this.voices = voices; // Assign to the class property
return voices; // Also return this list
}
// Each time a parameter is changed, we change the configuration
async changeTTSSettings() {
}
/**
* Fetch TTS generation from the API.
* @param {string} inputText Text to generate TTS for
* @param {string} voiceId Voice ID to use (model_type&speaker_id))
* @returns {Promise<Response|string>} Fetch response
*/
async fetchTtsGeneration(inputText, voiceId, lang = null, forceNoStreaming = false) {
console.info(`Generating new TTS for voice_id ${voiceId}`);
const streaming = !forceNoStreaming && this.settings.streaming;
const [model_type, speaker_id] = voiceId.split('&');
const params = new URLSearchParams();
params.append('text', inputText);
params.append('id', speaker_id);
if (streaming) {
params.append('streaming', streaming);
// Streaming response only supports MP3
}
else {
params.append('format', this.settings.format);
}
params.append('lang', lang ?? this.settings.lang);
params.append('length', this.settings.length);
params.append('noise', this.settings.noise);
params.append('noisew', this.settings.noisew);
params.append('segment_size', this.settings.segment_size);
if (model_type == this.modelTypes.W2V2_VITS) {
params.append('emotion', this.settings.dim_emotion);
}
else if (model_type == this.modelTypes.BERT_VITS2) {
params.append('sdp_ratio', this.settings.sdp_ratio);
params.append('emotion', this.settings.emotion);
if (this.settings.text_prompt) {
params.append('text_prompt', this.settings.text_prompt);
}
if (this.settings.style_text) {
params.append('style_text', this.settings.style_text);
params.append('style_weight', this.settings.style_weight);
}
}
const url = `${this.settings.provider_endpoint}/voice/${model_type.toLowerCase()}`;
if (streaming) {
return url + `?${params.toString()}`;
}
const response = await fetch(
url,
{
method: 'POST',
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
},
body: params,
},
);
if (!response.ok) {
toastr.error(response.statusText, 'TTS Generation Failed');
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
}
return response;
}
/**
* Preview TTS for a given voice ID.
* @param {string} id Voice ID
*/
async previewTtsVoice(id) {
this.audioElement.pause();
this.audioElement.currentTime = 0;
const voice = await this.getVoiceById(id);
const lang = voice.lang.includes(this.settings.lang) ? this.settings.lang : voice.lang[0];
let lang_code = this.langKey2LangCode[lang];
const text = getPreviewString(lang_code);
const response = await this.fetchTtsGeneration(text, id, lang, true);
if (typeof response != 'string') {
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
}
const audio = await response.blob();
const url = URL.createObjectURL(audio);
this.audioElement.src = url;
this.audioElement.play();
}
}
// Interface not used
async fetchTtsFromHistory(history_item_id) {
return Promise.resolve(history_item_id);
}
}