mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Merge branch 'staging' into parser-followup-2
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
import { ensureImageFormatSupported, getBase64Async, isTrueBoolean, saveBase64AsFile } from '../../utils.js';
|
||||
import { getContext, getApiUrl, doExtrasFetch, extension_settings, modules, renderExtensionTemplateAsync } from '../../extensions.js';
|
||||
import { callPopup, getRequestHeaders, saveSettingsDebounced, substituteParamsExtended } from '../../../script.js';
|
||||
import { appendMediaToMessage, callPopup, eventSource, event_types, getRequestHeaders, saveChatConditional, saveSettingsDebounced, substituteParamsExtended } from '../../../script.js';
|
||||
import { getMessageTimeStamp } from '../../RossAscends-mods.js';
|
||||
import { SECRET_KEYS, secret_state } from '../../secrets.js';
|
||||
import { getMultimodalCaption } from '../shared.js';
|
||||
@@ -84,12 +84,11 @@ async function setSpinnerIcon() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a captioned message to the chat.
|
||||
* @param {string} caption Caption text
|
||||
* @param {string} image Image URL
|
||||
* Wraps a caption with a message template.
|
||||
* @param {string} caption Raw caption
|
||||
* @returns {Promise<string>} Wrapped caption
|
||||
*/
|
||||
async function sendCaptionedMessage(caption, image) {
|
||||
const context = getContext();
|
||||
async function wrapCaptionTemplate(caption) {
|
||||
let template = extension_settings.caption.template || TEMPLATE_DEFAULT;
|
||||
|
||||
if (!/{{caption}}/i.test(template)) {
|
||||
@@ -101,7 +100,7 @@ async function sendCaptionedMessage(caption, image) {
|
||||
|
||||
if (extension_settings.caption.refine_mode) {
|
||||
messageText = await callPopup(
|
||||
'<h3>Review and edit the generated message:</h3>Press "Cancel" to abort the caption sending.',
|
||||
'<h3>Review and edit the generated caption:</h3>Press "Cancel" to abort the caption sending.',
|
||||
'input',
|
||||
messageText,
|
||||
{ rows: 5, okButton: 'Send' });
|
||||
@@ -111,6 +110,55 @@ async function sendCaptionedMessage(caption, image) {
|
||||
}
|
||||
}
|
||||
|
||||
return messageText;
|
||||
}
|
||||
|
||||
/**
 * Generates a caption for the image attached to an existing message and stores it on that message.
 *
 * The image is downloaded from `data.extra.image`, wrapped into a File and passed to the captioning
 * routine. The resulting caption is wrapped with the message template and written to `data.extra.title`.
 * If the message has no text of its own, the wrapped caption also becomes the message text;
 * otherwise the image is flagged as inline and the title is flagged to be appended.
 *
 * Nothing is changed when the message has no image, the image cannot be downloaded,
 * or no caption is produced.
 * @param {Object} data Message data
 * @returns {Promise<void>}
 */
async function captionExistingMessage(data) {
    const imageUrl = data?.extra?.image;

    if (!imageUrl) {
        return;
    }

    const response = await fetch(imageUrl);

    // Do not try to caption the body of an error response.
    if (!response.ok) {
        console.warn('Failed to fetch the image for captioning.', response.status);
        return;
    }

    const blob = await response.blob();
    // A missing header yields null, which the File constructor would stringify to "null".
    const type = response.headers.get('Content-Type') ?? blob.type;
    const file = new File([blob], 'image.png', { type });
    const caption = await getCaptionForFile(file, null, true);

    if (!caption) {
        console.warn('Failed to generate a caption for the image.');
        return;
    }

    const wrappedCaption = await wrapCaptionTemplate(caption);
    // A message without text must count as empty, not as the string "undefined".
    const messageText = String(data.mes ?? '').trim();

    data.extra.title = wrappedCaption;

    if (!messageText) {
        data.extra.inline_image = false;
        data.mes = wrappedCaption;
    } else {
        data.extra.inline_image = true;
        data.extra.append_title = true;
    }
}
|
||||
|
||||
/**
|
||||
* Sends a captioned message to the chat.
|
||||
* @param {string} caption Caption text
|
||||
* @param {string} image Image URL
|
||||
*/
|
||||
async function sendCaptionedMessage(caption, image) {
|
||||
const messageText = await wrapCaptionTemplate(caption);
|
||||
|
||||
const context = getContext();
|
||||
const message = {
|
||||
name: context.name1,
|
||||
is_user: true,
|
||||
@@ -356,6 +404,7 @@ jQuery(async function () {
|
||||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'llamacpp' && textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP]) ||
|
||||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'ooba' && textgenerationwebui_settings.server_urls[textgen_types.OOBA]) ||
|
||||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'koboldcpp' && textgenerationwebui_settings.server_urls[textgen_types.KOBOLDCPP]) ||
|
||||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'vllm' && textgenerationwebui_settings.server_urls[textgen_types.VLLM]) ||
|
||||
(extension_settings.caption.source === 'multimodal' && extension_settings.caption.multimodal_api === 'custom') ||
|
||||
extension_settings.caption.source === 'local' ||
|
||||
extension_settings.caption.source === 'horde';
|
||||
@@ -408,7 +457,7 @@ jQuery(async function () {
|
||||
});
|
||||
}
|
||||
/**
 * Renders the caption extension settings template and appends it to the caption settings container.
 * The default message template and default prompt are handed to the renderer as template data.
 * @returns {Promise<void>}
 */
async function addSettings() {
    const html = await renderExtensionTemplateAsync('caption', 'settings', { TEMPLATE_DEFAULT, PROMPT_DEFAULT });
    $('#caption_container').append(html);
}
|
||||
|
||||
@@ -422,6 +471,7 @@ jQuery(async function () {
|
||||
$('#caption_refine_mode').prop('checked', !!(extension_settings.caption.refine_mode));
|
||||
$('#caption_allow_reverse_proxy').prop('checked', !!(extension_settings.caption.allow_reverse_proxy));
|
||||
$('#caption_prompt_ask').prop('checked', !!(extension_settings.caption.prompt_ask));
|
||||
$('#caption_auto_mode').prop('checked', !!(extension_settings.caption.auto_mode));
|
||||
$('#caption_source').val(extension_settings.caption.source);
|
||||
$('#caption_prompt').val(extension_settings.caption.prompt);
|
||||
$('#caption_template').val(extension_settings.caption.template);
|
||||
@@ -447,6 +497,41 @@ jQuery(async function () {
|
||||
extension_settings.caption.prompt_ask = $('#caption_prompt_ask').prop('checked');
|
||||
saveSettingsDebounced();
|
||||
});
|
||||
$('#caption_auto_mode').on('input', () => {
|
||||
extension_settings.caption.auto_mode = !!$('#caption_auto_mode').prop('checked');
|
||||
saveSettingsDebounced();
|
||||
});
|
||||
|
||||
// Captions the image of the chat message at the given index, but only while auto mode is switched on.
const onMessageEvent = async (index) => {
    if (extension_settings.caption.auto_mode) {
        const message = getContext().chat[index];
        await captionExistingMessage(message);
    }
};
|
||||
|
||||
eventSource.on(event_types.MESSAGE_SENT, onMessageEvent);
|
||||
eventSource.on(event_types.MESSAGE_FILE_EMBEDDED, onMessageEvent);
|
||||
|
||||
// Re-captions the image of the clicked message, then re-renders its media and saves the chat.
$(document).on('click', '.mes_img_caption', async function () {
    const busyClass = 'fa-fade';
    const $message = $(this).closest('.mes');
    const $image = $message.find('.mes_img');

    // The fade class doubles as the "request in flight" marker for this image.
    if ($image.hasClass(busyClass)) {
        return;
    }

    $image.addClass(busyClass);

    try {
        const messageId = Number($message.attr('mesid'));
        const message = getContext().chat[messageId];
        await captionExistingMessage(message);
        appendMediaToMessage(message, $message, false);
        await saveChatConditional();
    } catch (error) {
        console.error('Message image recaption failed', error);
    } finally {
        $image.removeClass(busyClass);
    }
});
|
||||
|
||||
SlashCommandParser.addCommandObject(SlashCommand.fromProps({ name: 'caption',
|
||||
callback: captionCommandCallback,
|
||||
@@ -482,4 +567,6 @@ jQuery(async function () {
|
||||
</div>
|
||||
`,
|
||||
}));
|
||||
|
||||
document.body.classList.add('caption');
|
||||
});
|
||||
|
@@ -26,6 +26,7 @@
|
||||
<option value="openai">OpenAI</option>
|
||||
<option value="openrouter">OpenRouter</option>
|
||||
<option value="ooba" data-i18n="Text Generation WebUI (oobabooga)">Text Generation WebUI (oobabooga)</option>
|
||||
<option value="vllm">vLLM</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex1 flex-container flexFlowColumn flexNoGap">
|
||||
@@ -66,6 +67,7 @@
|
||||
<option data-type="llamacpp" value="llamacpp_current" data-i18n="currently_loaded">[Currently loaded]</option>
|
||||
<option data-type="ooba" value="ooba_current" data-i18n="currently_loaded">[Currently loaded]</option>
|
||||
<option data-type="koboldcpp" value="koboldcpp_current" data-i18n="currently_loaded">[Currently loaded]</option>
|
||||
<option data-type="vllm" value="vllm_current" data-i18n="currently_selected">[Currently selected]</option>
|
||||
<option data-type="custom" value="custom_current" data-i18n="currently_selected">[Currently selected]</option>
|
||||
</select>
|
||||
</div>
|
||||
@@ -82,14 +84,19 @@
|
||||
</div>
|
||||
<div id="caption_prompt_block">
|
||||
<label for="caption_prompt" data-i18n="Caption Prompt">Caption Prompt</label>
|
||||
<textarea id="caption_prompt" class="text_pole" rows="1" placeholder="< Use default >">${PROMPT_DEFAULT}</textarea>
|
||||
<textarea id="caption_prompt" class="text_pole" rows="1" placeholder="< Use default >">{{PROMPT_DEFAULT}}</textarea>
|
||||
<label class="checkbox_label margin-bot-10px" for="caption_prompt_ask" title="Ask for a custom prompt every time an image is captioned.">
|
||||
<input id="caption_prompt_ask" type="checkbox" class="checkbox">
|
||||
<span data-i18n="Ask every time">Ask every time</span>
|
||||
</label>
|
||||
</div>
|
||||
<label for="caption_template"><span data-i18n="Message Template">Message Template</span> <small><span data-i18n="(use _space">(use </span> <code>{{caption}}</code> <span data-i18n="macro)">macro)</span></small></label>
|
||||
<textarea id="caption_template" class="text_pole" rows="2" placeholder="< Use default >">${TEMPLATE_DEFAULT}</textarea>
|
||||
<textarea id="caption_template" class="text_pole" rows="2" placeholder="< Use default >">{{TEMPLATE_DEFAULT}}</textarea>
|
||||
<label class="checkbox_label" for="caption_auto_mode">
|
||||
<input id="caption_auto_mode" type="checkbox" class="checkbox">
|
||||
<span data-i18n="Automatically caption images">Automatically caption images</span>
|
||||
<i class="fa-solid fa-info-circle" title="Automatically caption images when they are pasted into the chat or attached to messages."></i>
|
||||
</label>
|
||||
<label class="checkbox_label margin-bot-10px" for="caption_refine_mode">
|
||||
<input id="caption_refine_mode" type="checkbox" class="checkbox">
|
||||
<span data-i18n="Edit captions before saving">Edit captions before saving</span>
|
||||
|
@@ -34,6 +34,7 @@ export async function getMultimodalCaption(base64Img, prompt) {
|
||||
const isCustom = extension_settings.caption.multimodal_api === 'custom';
|
||||
const isOoba = extension_settings.caption.multimodal_api === 'ooba';
|
||||
const isKoboldCpp = extension_settings.caption.multimodal_api === 'koboldcpp';
|
||||
const isVllm = extension_settings.caption.multimodal_api === 'vllm';
|
||||
const base64Bytes = base64Img.length * 0.75;
|
||||
const compressionLimit = 2 * 1024 * 1024;
|
||||
if ((['google', 'openrouter'].includes(extension_settings.caption.multimodal_api) && base64Bytes > compressionLimit) || isOoba || isKoboldCpp) {
|
||||
@@ -65,6 +66,14 @@ export async function getMultimodalCaption(base64Img, prompt) {
|
||||
requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.OLLAMA];
|
||||
}
|
||||
|
||||
if (isVllm) {
|
||||
if (extension_settings.caption.multimodal_model === 'vllm_current') {
|
||||
requestBody.model = textgenerationwebui_settings.vllm_model;
|
||||
}
|
||||
|
||||
requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.VLLM];
|
||||
}
|
||||
|
||||
if (isLlamaCpp) {
|
||||
requestBody.server_url = textgenerationwebui_settings.server_urls[textgen_types.LLAMACPP];
|
||||
}
|
||||
@@ -151,6 +160,14 @@ function throwIfInvalidModel(useReverseProxy) {
|
||||
throw new Error('KoboldCpp server URL is not set.');
|
||||
}
|
||||
|
||||
if (extension_settings.caption.multimodal_api === 'vllm' && !textgenerationwebui_settings.server_urls[textgen_types.VLLM]) {
|
||||
throw new Error('vLLM server URL is not set.');
|
||||
}
|
||||
|
||||
if (extension_settings.caption.multimodal_api === 'vllm' && extension_settings.caption.multimodal_model === 'vllm_current' && !textgenerationwebui_settings.vllm_model) {
|
||||
throw new Error('vLLM model is not set.');
|
||||
}
|
||||
|
||||
if (extension_settings.caption.multimodal_api === 'custom' && !oai_settings.custom_url) {
|
||||
throw new Error('Custom API URL is not set.');
|
||||
}
|
||||
|
@@ -3,7 +3,6 @@ import {
|
||||
systemUserName,
|
||||
hideSwipeButtons,
|
||||
showSwipeButtons,
|
||||
callPopup,
|
||||
getRequestHeaders,
|
||||
event_types,
|
||||
eventSource,
|
||||
@@ -29,10 +28,9 @@ import { getMultimodalCaption } from '../shared.js';
|
||||
import { SlashCommandParser } from '../../slash-commands/SlashCommandParser.js';
|
||||
import { SlashCommand } from '../../slash-commands/SlashCommand.js';
|
||||
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from '../../slash-commands/SlashCommandArgument.js';
|
||||
import { resolveVariable } from '../../variables.js';
|
||||
import { debounce_timeout } from '../../constants.js';
|
||||
import { commonEnumProviders } from '../../slash-commands/SlashCommandCommonEnumsProvider.js';
|
||||
import { SlashCommandEnumValue } from '../../slash-commands/SlashCommandEnumValue.js';
|
||||
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
|
||||
export { MODULE_NAME };
|
||||
|
||||
const MODULE_NAME = 'sd';
|
||||
@@ -572,7 +570,7 @@ async function onDeleteStyleClick() {
|
||||
return;
|
||||
}
|
||||
|
||||
const confirmed = await callPopup(`Are you sure you want to delete the style "${selectedStyle}"?`, 'confirm', '', { okButton: 'Delete' });
|
||||
const confirmed = await callGenericPopup(`Are you sure you want to delete the style "${selectedStyle}"?`, POPUP_TYPE.CONFIRM, '', { okButton: 'Delete', cancelButton: 'Cancel' });
|
||||
|
||||
if (!confirmed) {
|
||||
return;
|
||||
@@ -601,7 +599,7 @@ async function onDeleteStyleClick() {
|
||||
}
|
||||
|
||||
async function onSaveStyleClick() {
|
||||
const userInput = await callPopup('Enter style name:', 'input', '', { okButton: 'Save' });
|
||||
const userInput = await callGenericPopup('Enter style name:', POPUP_TYPE.INPUT);
|
||||
|
||||
if (!userInput) {
|
||||
return;
|
||||
@@ -670,7 +668,7 @@ async function refinePrompt(prompt, allowExpand, isNegative = false) {
|
||||
|
||||
if (extension_settings.sd.refine_mode) {
|
||||
const text = isNegative ? '<h3>Review and edit the <i>negative</i> prompt:</h3>' : '<h3>Review and edit the prompt:</h3>';
|
||||
const refinedPrompt = await callPopup(text + 'Press "Cancel" to abort the image generation.', 'input', prompt.trim(), { rows: 5, okButton: 'Continue' });
|
||||
const refinedPrompt = await callGenericPopup(text + 'Press "Cancel" to abort the image generation.', POPUP_TYPE.INPUT, prompt.trim(), { rows: 5, okButton: 'Continue' });
|
||||
|
||||
if (refinedPrompt) {
|
||||
return refinedPrompt;
|
||||
@@ -2918,25 +2916,25 @@ async function generateComfyImage(prompt, negativePrompt) {
|
||||
const text = await workflowResponse.text();
|
||||
toastr.error(`Failed to load workflow.\n\n${text}`);
|
||||
}
|
||||
let workflow = (await workflowResponse.json()).replace('"%prompt%"', JSON.stringify(prompt));
|
||||
workflow = workflow.replace('"%negative_prompt%"', JSON.stringify(negativePrompt));
|
||||
let workflow = (await workflowResponse.json()).replaceAll('"%prompt%"', JSON.stringify(prompt));
|
||||
workflow = workflow.replaceAll('"%negative_prompt%"', JSON.stringify(negativePrompt));
|
||||
|
||||
const seed = extension_settings.sd.seed >= 0 ? extension_settings.sd.seed : Math.round(Math.random() * Number.MAX_SAFE_INTEGER);
|
||||
workflow = workflow.replaceAll('"%seed%"', JSON.stringify(seed));
|
||||
placeholders.forEach(ph => {
|
||||
workflow = workflow.replace(`"%${ph}%"`, JSON.stringify(extension_settings.sd[ph]));
|
||||
workflow = workflow.replaceAll(`"%${ph}%"`, JSON.stringify(extension_settings.sd[ph]));
|
||||
});
|
||||
(extension_settings.sd.comfy_placeholders ?? []).forEach(ph => {
|
||||
workflow = workflow.replace(`"%${ph.find}%"`, JSON.stringify(substituteParams(ph.replace)));
|
||||
workflow = workflow.replaceAll(`"%${ph.find}%"`, JSON.stringify(substituteParams(ph.replace)));
|
||||
});
|
||||
if (/%user_avatar%/gi.test(workflow)) {
|
||||
const response = await fetch(getUserAvatarUrl());
|
||||
if (response.ok) {
|
||||
const avatarBlob = await response.blob();
|
||||
const avatarBase64 = await getBase64Async(avatarBlob);
|
||||
workflow = workflow.replace('"%user_avatar%"', JSON.stringify(avatarBase64));
|
||||
workflow = workflow.replaceAll('"%user_avatar%"', JSON.stringify(avatarBase64));
|
||||
} else {
|
||||
workflow = workflow.replace('"%user_avatar%"', JSON.stringify(PNG_PIXEL));
|
||||
workflow = workflow.replaceAll('"%user_avatar%"', JSON.stringify(PNG_PIXEL));
|
||||
}
|
||||
}
|
||||
if (/%char_avatar%/gi.test(workflow)) {
|
||||
@@ -2944,9 +2942,9 @@ async function generateComfyImage(prompt, negativePrompt) {
|
||||
if (response.ok) {
|
||||
const avatarBlob = await response.blob();
|
||||
const avatarBase64 = await getBase64Async(avatarBlob);
|
||||
workflow = workflow.replace('"%char_avatar%"', JSON.stringify(avatarBase64));
|
||||
workflow = workflow.replaceAll('"%char_avatar%"', JSON.stringify(avatarBase64));
|
||||
} else {
|
||||
workflow = workflow.replace('"%char_avatar%"', JSON.stringify(PNG_PIXEL));
|
||||
workflow = workflow.replaceAll('"%char_avatar%"', JSON.stringify(PNG_PIXEL));
|
||||
}
|
||||
}
|
||||
console.log(`{
|
||||
@@ -2978,7 +2976,7 @@ async function onComfyOpenWorkflowEditorClick() {
|
||||
}),
|
||||
})).json();
|
||||
const editorHtml = $(await $.get('scripts/extensions/stable-diffusion/comfyWorkflowEditor.html'));
|
||||
const popupResult = callPopup(editorHtml, 'confirm', undefined, { okButton: 'Save', wide: true, large: true, rows: 1 });
|
||||
const popupResult = callGenericPopup(editorHtml, POPUP_TYPE.CONFIRM, '', { okButton: 'Save', cancelButton: 'Cancel', wide: true, large: true });
|
||||
const checkPlaceholders = () => {
|
||||
workflow = $('#sd_comfy_workflow_editor_workflow').val().toString();
|
||||
$('.sd_comfy_workflow_editor_placeholder_list > li[data-placeholder]').each(function (idx) {
|
||||
@@ -3058,7 +3056,7 @@ async function onComfyOpenWorkflowEditorClick() {
|
||||
}
|
||||
|
||||
async function onComfyNewWorkflowClick() {
|
||||
let name = await callPopup('<h3>Workflow name:</h3>', 'input');
|
||||
let name = await callGenericPopup('Workflow name:', POPUP_TYPE.INPUT);
|
||||
if (!name) {
|
||||
return;
|
||||
}
|
||||
@@ -3085,7 +3083,7 @@ async function onComfyNewWorkflowClick() {
|
||||
}
|
||||
|
||||
async function onComfyDeleteWorkflowClick() {
|
||||
const confirm = await callPopup('Delete the workflow? This action is irreversible.', 'confirm');
|
||||
const confirm = await callGenericPopup('Delete the workflow? This action is irreversible.', POPUP_TYPE.CONFIRM, '', { okButton: 'Delete', cancelButton: 'Cancel' });
|
||||
if (!confirm) {
|
||||
return;
|
||||
}
|
||||
|
@@ -1,4 +1,5 @@
|
||||
import { callPopup, getRequestHeaders } from '../../../script.js';
|
||||
import { getRequestHeaders } from '../../../script.js';
|
||||
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
|
||||
import { SECRET_KEYS, findSecret, secret_state, writeSecret } from '../../secrets.js';
|
||||
import { getPreviewString, saveTtsProviderSettings } from './index.js';
|
||||
export { AzureTtsProvider };
|
||||
@@ -69,13 +70,13 @@ class AzureTtsProvider {
|
||||
const popupText = 'Azure TTS API Key';
|
||||
const savedKey = secret_state[SECRET_KEYS.AZURE_TTS] ? await findSecret(SECRET_KEYS.AZURE_TTS) : '';
|
||||
|
||||
const key = await callPopup(popupText, 'input', savedKey);
|
||||
const key = await callGenericPopup(popupText, POPUP_TYPE.INPUT, savedKey);
|
||||
|
||||
if (key == false || key == '') {
|
||||
return;
|
||||
}
|
||||
|
||||
await writeSecret(SECRET_KEYS.AZURE_TTS, key);
|
||||
await writeSecret(SECRET_KEYS.AZURE_TTS, String(key));
|
||||
|
||||
toastr.success('API Key saved');
|
||||
$('#azure_tts_key').addClass('success');
|
||||
|
@@ -5,8 +5,8 @@ TODO:
|
||||
*/
|
||||
|
||||
import { doExtrasFetch, extension_settings, getApiUrl, modules } from '../../extensions.js';
|
||||
import { callPopup } from '../../../script.js';
|
||||
import { initVoiceMap } from './index.js';
|
||||
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
|
||||
|
||||
export { CoquiTtsProvider };
|
||||
|
||||
@@ -246,7 +246,7 @@ class CoquiTtsProvider {
|
||||
}
|
||||
|
||||
// Ask user for voiceId name to save voice
|
||||
const voiceName = await callPopup('<h3>Name of Coqui voice to add to voice select dropdown:</h3>', 'input');
|
||||
const voiceName = await callGenericPopup('Name of Coqui voice to add to voice select dropdown:', POPUP_TYPE.INPUT);
|
||||
|
||||
const model_origin = $('#coqui_model_origin').val();
|
||||
const model_language = $('#coqui_api_language').val();
|
||||
|
@@ -1,4 +1,4 @@
|
||||
import { cancelTtsPlay, eventSource, event_types, isStreamingEnabled, name2, saveSettingsDebounced, substituteParams } from '../../../script.js';
|
||||
import { cancelTtsPlay, eventSource, event_types, getCurrentChatId, isStreamingEnabled, name2, saveSettingsDebounced, substituteParams } from '../../../script.js';
|
||||
import { ModuleWorkerWrapper, doExtrasFetch, extension_settings, getApiUrl, getContext, modules, renderExtensionTemplateAsync } from '../../extensions.js';
|
||||
import { delay, escapeRegex, getBase64Async, getStringHash, onlyUnique } from '../../utils.js';
|
||||
import { EdgeTtsProvider } from './edge.js';
|
||||
@@ -10,6 +10,7 @@ import { NovelTtsProvider } from './novel.js';
|
||||
import { power_user } from '../../power-user.js';
|
||||
import { OpenAITtsProvider } from './openai.js';
|
||||
import { XTTSTtsProvider } from './xtts.js';
|
||||
import { VITSTtsProvider } from './vits.js';
|
||||
import { GSVITtsProvider } from './gsvi.js';
|
||||
import { SBVits2TtsProvider } from './sbvits2.js';
|
||||
import { AllTalkTtsProvider } from './alltalk.js';
|
||||
@@ -34,6 +35,7 @@ let lastMessage = null;
|
||||
let lastMessageHash = null;
|
||||
let periodicMessageGenerationTimer = null;
|
||||
let lastPositionOfParagraphEnd = -1;
|
||||
let currentInitVoiceMapPromise = null;
|
||||
|
||||
const DEFAULT_VOICE_MARKER = '[Default Voice]';
|
||||
const DISABLED_VOICE_MARKER = 'disabled';
|
||||
@@ -83,6 +85,7 @@ const ttsProviders = {
|
||||
ElevenLabs: ElevenLabsTtsProvider,
|
||||
Silero: SileroTtsProvider,
|
||||
XTTSv2: XTTSTtsProvider,
|
||||
VITS: VITSTtsProvider,
|
||||
GSVI: GSVITtsProvider,
|
||||
SBVits2: SBVits2TtsProvider,
|
||||
System: SystemTtsProvider,
|
||||
@@ -1008,9 +1011,39 @@ class VoiceMapEntry {
|
||||
|
||||
/**
 * Initializes voiceMapEntries for the character select list, allowing only one run at a time.
 * While a run is pending, every caller receives the promise of that pending run;
 * the `unrestricted` value of such a later call is not applied to it.
 * If the current chat id differs once the run has finished, the initialization is started again.
 * @param {boolean} unrestricted - If true, will include all characters in voiceMapEntries, even if they are not in the current chat.
 * @returns {Promise} A promise that resolves when the initialization is complete.
 */
export async function initVoiceMap(unrestricted = false) {
    if (!currentInitVoiceMapPromise) {
        const runOnce = async () => {
            const chatIdBefore = getCurrentChatId();

            try {
                await initVoiceMapInternal(unrestricted);
            } finally {
                // Release the guard first so the follow-up call below starts a fresh run.
                currentInitVoiceMapPromise = null;
            }

            // The chat was switched while initializing: run again.
            if (getCurrentChatId() !== chatIdBefore) {
                await initVoiceMap(unrestricted);
            }
        };

        currentInitVoiceMapPromise = runOnce();
    }

    return currentInitVoiceMapPromise;
}
|
||||
|
||||
/**
|
||||
* Init voiceMapEntries for character select list.
|
||||
* @param {boolean} unrestricted - If true, will include all characters in voiceMapEntries, even if they are not in the current chat.
|
||||
*/
|
||||
async function initVoiceMapInternal(unrestricted) {
|
||||
// Gate initialization if not enabled or TTS Provider not ready. Prevents error popups.
|
||||
const enabled = $('#tts_enabled').is(':checked');
|
||||
if (!enabled) {
|
||||
|
@@ -1,4 +1,5 @@
|
||||
import { getRequestHeaders, callPopup } from '../../../script.js';
|
||||
import { getRequestHeaders } from '../../../script.js';
|
||||
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
|
||||
import { splitRecursive } from '../../utils.js';
|
||||
import { getPreviewString, saveTtsProviderSettings } from './index.js';
|
||||
import { initVoiceMap } from './index.js';
|
||||
@@ -56,7 +57,7 @@ class NovelTtsProvider {
|
||||
|
||||
// Add a new Novel custom voice to provider
|
||||
async addCustomVoice() {
|
||||
const voiceName = await callPopup('<h3>Custom Voice name:</h3>', 'input');
|
||||
const voiceName = await callGenericPopup('Custom Voice name:', POPUP_TYPE.INPUT);
|
||||
this.settings.customVoices.push(voiceName);
|
||||
this.populateCustomVoices();
|
||||
initVoiceMap(); // Update TTS extension voiceMap
|
||||
|
404
public/scripts/extensions/tts/vits.js
Normal file
404
public/scripts/extensions/tts/vits.js
Normal file
@@ -0,0 +1,404 @@
|
||||
import { getPreviewString, saveTtsProviderSettings } from './index.js';
|
||||
|
||||
export { VITSTtsProvider };
|
||||
|
||||
class VITSTtsProvider {
|
||||
//########//
|
||||
// Config //
|
||||
//########//
|
||||
|
||||
settings;
|
||||
ready = false;
|
||||
voices = [];
|
||||
separator = '. ';
|
||||
audioElement = document.createElement('audio');
|
||||
|
||||
/**
|
||||
* Perform any text processing before passing to TTS engine.
|
||||
* @param {string} text Input text
|
||||
* @returns {string} Processed text
|
||||
*/
|
||||
processText(text) {
|
||||
return text;
|
||||
}
|
||||
|
||||
audioFormats = ['wav', 'ogg', 'silk', 'mp3', 'flac'];
|
||||
|
||||
languageLabels = {
|
||||
'Auto': 'auto',
|
||||
'Chinese': 'zh',
|
||||
'English': 'en',
|
||||
'Japanese': 'ja',
|
||||
'Korean': 'ko',
|
||||
};
|
||||
|
||||
langKey2LangCode = {
|
||||
'zh': 'zh-CN',
|
||||
'en': 'en-US',
|
||||
'ja': 'ja-JP',
|
||||
'ko': 'ko-KR',
|
||||
};
|
||||
|
||||
modelTypes = {
|
||||
VITS: 'VITS',
|
||||
W2V2_VITS: 'W2V2-VITS',
|
||||
BERT_VITS2: 'BERT-VITS2',
|
||||
};
|
||||
|
||||
defaultSettings = {
|
||||
provider_endpoint: 'http://localhost:23456',
|
||||
format: 'wav',
|
||||
lang: 'auto',
|
||||
length: 1.0,
|
||||
noise: 0.33,
|
||||
noisew: 0.4,
|
||||
segment_size: 50,
|
||||
streaming: false,
|
||||
dim_emotion: 0,
|
||||
sdp_ratio: 0.2,
|
||||
emotion: 0,
|
||||
text_prompt: '',
|
||||
style_text: '',
|
||||
style_weight: 1,
|
||||
};
|
||||
|
||||
get settingsHtml() {
|
||||
let html = `
|
||||
<label for="vits_lang">Text Language</label>
|
||||
<select id="vits_lang">`;
|
||||
|
||||
for (let language in this.languageLabels) {
|
||||
if (this.languageLabels[language] == this.settings?.lang) {
|
||||
html += `<option value="${this.languageLabels[language]}" selected="selected">${language}</option>`;
|
||||
continue;
|
||||
}
|
||||
html += `<option value="${this.languageLabels[language]}">${language}</option>`;
|
||||
}
|
||||
|
||||
html += `
|
||||
</select>
|
||||
<label>VITS / W2V2-VITS / Bert-VITS2 Settings:</label><br/>
|
||||
<label for="vits_endpoint">Provider Endpoint:</label>
|
||||
<input id="vits_endpoint" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.provider_endpoint}"/>
|
||||
<span>Use <a target="_blank" href="https://github.com/Artrajz/vits-simple-api">vits-simple-api</a>.</span><br/>
|
||||
|
||||
<label for="vits_format">Audio format:</label>
|
||||
<select id="vits_format">`;
|
||||
|
||||
for (let format of this.audioFormats) {
|
||||
if (format == this.settings?.format) {
|
||||
html += `<option value="${format}" selected="selected">${format}</option>`;
|
||||
continue;
|
||||
}
|
||||
html += `<option value="${format}">${format}</option>`;
|
||||
}
|
||||
|
||||
html += `
|
||||
</select>
|
||||
<label for="vits_length">Audio length: <span id="vits_length_output">${this.defaultSettings.length}</span></label>
|
||||
<input id="vits_length" type="range" value="${this.defaultSettings.length}" min="0.0" max="5" step="0.01" />
|
||||
|
||||
<label for="vits_noise">Noise: <span id="vits_noise_output">${this.defaultSettings.noise}</span></label>
|
||||
<input id="vits_noise" type="range" value="${this.defaultSettings.noise}" min="0.1" max="2" step="0.01" />
|
||||
|
||||
<label for="vits_noisew">SDP noise: <span id="vits_noisew_output">${this.defaultSettings.noisew}</span></label>
|
||||
<input id="vits_noisew" type="range" value="${this.defaultSettings.noisew}" min="0.1" max="2" step="0.01" />
|
||||
|
||||
<label for="vits_segment_size">Segment Size: <span id="vits_segment_size_output">${this.defaultSettings.segment_size}</span></label>
|
||||
<input id="vits_segment_size" type="range" value="${this.defaultSettings.segment_size}" min="0" max="1000" step="1" />
|
||||
|
||||
<label for="vits_streaming" class="checkbox_label">
|
||||
<input id="vits_streaming" type="checkbox" />
|
||||
<span>Streaming</span>
|
||||
</label>
|
||||
|
||||
<label>W2V2-VITS Settings:</label><br/>
|
||||
<label for="vits_dim_emotion">Dimensional emotion:</label>
|
||||
<input id="vits_dim_emotion" type="number" class="text_pole" min="0" max="5457" step="1" value="${this.defaultSettings.dim_emotion}"/>
|
||||
|
||||
<label>BERT-VITS2 Settings:</label><br/>
|
||||
<label for="vits_sdp_ratio">sdp_ratio: <span id="vits_sdp_ratio_output">${this.defaultSettings.sdp_ratio}</span></label>
|
||||
<input id="vits_sdp_ratio" type="range" value="${this.defaultSettings.sdp_ratio}" min="0.0" max="1" step="0.01" />
|
||||
|
||||
<label for="vits_emotion">emotion: <span id="vits_emotion_output">${this.defaultSettings.emotion}</span></label>
|
||||
<input id="vits_emotion" type="range" value="${this.defaultSettings.emotion}" min="0" max="9" step="1" />
|
||||
|
||||
<label for="vits_text_prompt">Text Prompt:</label>
|
||||
<input id="vits_text_prompt" type="text" class="text_pole" maxlength="512" value="${this.defaultSettings.text_prompt}"/>
|
||||
|
||||
<label for="vits_style_text">Style text:</label>
|
||||
<input id="vits_style_text" type="text" class="text_pole" maxlength="512" value="${this.defaultSettings.style_text}"/>
|
||||
|
||||
<label for="vits_style_weight">Style weight <span id="vits_style_weight_output">${this.defaultSettings.style_weight}</span></label>
|
||||
<input id="vits_style_weight" type="range" value="${this.defaultSettings.style_weight}" min="0" max="1" step="0.01" />
|
||||
`;
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
onSettingsChange() {
|
||||
// Used when provider settings are updated from UI
|
||||
this.settings.provider_endpoint = $('#vits_endpoint').val();
|
||||
this.settings.lang = $('#vits_lang').val();
|
||||
this.settings.format = $('#vits_format').val();
|
||||
this.settings.dim_emotion = $('#vits_dim_emotion').val();
|
||||
this.settings.text_prompt = $('#vits_text_prompt').val();
|
||||
this.settings.style_text = $('#vits_style_text').val();
|
||||
|
||||
// Update the default TTS settings based on input fields
|
||||
this.settings.length = $('#vits_length').val();
|
||||
this.settings.noise = $('#vits_noise').val();
|
||||
this.settings.noisew = $('#vits_noisew').val();
|
||||
this.settings.segment_size = $('#vits_segment_size').val();
|
||||
this.settings.streaming = $('#vits_streaming').is(':checked');
|
||||
this.settings.sdp_ratio = $('#vits_sdp_ratio').val();
|
||||
this.settings.emotion = $('#vits_emotion').val();
|
||||
this.settings.style_weight = $('#vits_style_weight').val();
|
||||
|
||||
// Update the UI to reflect changes
|
||||
$('#vits_length_output').text(this.settings.length);
|
||||
$('#vits_noise_output').text(this.settings.noise);
|
||||
$('#vits_noisew_output').text(this.settings.noisew);
|
||||
$('#vits_segment_size_output').text(this.settings.segment_size);
|
||||
$('#vits_sdp_ratio_output').text(this.settings.sdp_ratio);
|
||||
$('#vits_emotion_output').text(this.settings.emotion);
|
||||
$('#vits_style_weight_output').text(this.settings.style_weight);
|
||||
|
||||
saveTtsProviderSettings();
|
||||
this.changeTTSSettings();
|
||||
}
|
||||
|
||||
async loadSettings(settings) {
|
||||
// Pupulate Provider UI given input settings
|
||||
if (Object.keys(settings).length == 0) {
|
||||
console.info('Using default TTS Provider settings');
|
||||
}
|
||||
|
||||
// Only accept keys defined in defaultSettings
|
||||
this.settings = this.defaultSettings;
|
||||
|
||||
for (const key in settings) {
|
||||
if (key in this.settings) {
|
||||
this.settings[key] = settings[key];
|
||||
} else {
|
||||
console.debug(`Ignoring non-user-configurable setting: ${key}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Set initial values from the settings
|
||||
$('#vits_endpoint').val(this.settings.provider_endpoint);
|
||||
$('#vits_lang').val(this.settings.lang);
|
||||
$('#vits_format').val(this.settings.format);
|
||||
$('#vits_length').val(this.settings.length);
|
||||
$('#vits_noise').val(this.settings.noise);
|
||||
$('#vits_noisew').val(this.settings.noisew);
|
||||
$('#vits_segment_size').val(this.settings.segment_size);
|
||||
$('#vits_streaming').prop('checked', this.settings.streaming);
|
||||
$('#vits_dim_emotion').val(this.settings.dim_emotion);
|
||||
$('#vits_sdp_ratio').val(this.settings.sdp_ratio);
|
||||
$('#vits_emotion').val(this.settings.emotion);
|
||||
$('#vits_text_prompt').val(this.settings.text_prompt);
|
||||
$('#vits_style_text').val(this.settings.style_text);
|
||||
$('#vits_style_weight').val(this.settings.style_weight);
|
||||
|
||||
// Update the UI to reflect changes
|
||||
$('#vits_length_output').text(this.settings.length);
|
||||
$('#vits_noise_output').text(this.settings.noise);
|
||||
$('#vits_noisew_output').text(this.settings.noisew);
|
||||
$('#vits_segment_size_output').text(this.settings.segment_size);
|
||||
$('#vits_sdp_ratio_output').text(this.settings.sdp_ratio);
|
||||
$('#vits_emotion_output').text(this.settings.emotion);
|
||||
$('#vits_style_weight_output').text(this.settings.style_weight);
|
||||
|
||||
// Register input/change event listeners to update settings on user interaction
|
||||
$('#vits_endpoint').on('input', () => { this.onSettingsChange(); });
|
||||
$('#vits_lang').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_format').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_length').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_noise').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_noisew').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_segment_size').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_streaming').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_dim_emotion').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_sdp_ratio').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_emotion').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_text_prompt').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_style_text').on('change', () => { this.onSettingsChange(); });
|
||||
$('#vits_style_weight').on('change', () => { this.onSettingsChange(); });
|
||||
|
||||
await this.checkReady();
|
||||
|
||||
console.info('VITS: Settings loaded');
|
||||
}
|
||||
|
||||
// Perform a simple readiness check by trying to fetch voiceIds
|
||||
async checkReady() {
|
||||
await Promise.allSettled([this.fetchTtsVoiceObjects(), this.changeTTSSettings()]);
|
||||
}
|
||||
|
||||
async onRefreshClick() {
|
||||
return;
|
||||
}
|
||||
|
||||
//#################//
|
||||
// TTS Interfaces //
|
||||
//#################//
|
||||
|
||||
async getVoice(voiceName) {
|
||||
if (this.voices.length == 0) {
|
||||
this.voices = await this.fetchTtsVoiceObjects();
|
||||
}
|
||||
const match = this.voices.filter(
|
||||
v => v.name == voiceName,
|
||||
)[0];
|
||||
if (!match) {
|
||||
throw `TTS Voice name ${voiceName} not found`;
|
||||
}
|
||||
return match;
|
||||
}
|
||||
|
||||
async getVoiceById(voiceId) {
|
||||
if (this.voices.length == 0) {
|
||||
this.voices = await this.fetchTtsVoiceObjects();
|
||||
}
|
||||
const match = this.voices.filter(
|
||||
v => v.voice_id == voiceId,
|
||||
)[0];
|
||||
if (!match) {
|
||||
throw `TTS Voice id ${voiceId} not found`;
|
||||
}
|
||||
return match;
|
||||
}
|
||||
|
||||
async generateTts(text, voiceId) {
|
||||
const response = await this.fetchTtsGeneration(text, voiceId);
|
||||
return response;
|
||||
}
|
||||
|
||||
//###########//
|
||||
// API CALLS //
|
||||
//###########//
|
||||
async fetchTtsVoiceObjects() {
|
||||
const response = await fetch(`${this.settings.provider_endpoint}/voice/speakers`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${await response.json()}`);
|
||||
}
|
||||
const jsonData = await response.json();
|
||||
const voices = [];
|
||||
|
||||
const addVoices = (modelType) => {
|
||||
jsonData[modelType].forEach(voice => {
|
||||
voices.push({
|
||||
name: `[${modelType}] ${voice.name} (${voice.lang})`,
|
||||
voice_id: `${modelType}&${voice.id}`,
|
||||
preview_url: false,
|
||||
lang: voice.lang,
|
||||
});
|
||||
});
|
||||
};
|
||||
for (const key in this.modelTypes) {
|
||||
addVoices(this.modelTypes[key]);
|
||||
}
|
||||
|
||||
this.voices = voices; // Assign to the class property
|
||||
return voices; // Also return this list
|
||||
}
|
||||
|
||||
// Each time a parameter is changed, we change the configuration
|
||||
async changeTTSSettings() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch TTS generation from the API.
|
||||
* @param {string} inputText Text to generate TTS for
|
||||
* @param {string} voiceId Voice ID to use (model_type&speaker_id))
|
||||
* @returns {Promise<Response|string>} Fetch response
|
||||
*/
|
||||
async fetchTtsGeneration(inputText, voiceId, lang = null, forceNoStreaming = false) {
|
||||
console.info(`Generating new TTS for voice_id ${voiceId}`);
|
||||
|
||||
const streaming = !forceNoStreaming && this.settings.streaming;
|
||||
const [model_type, speaker_id] = voiceId.split('&');
|
||||
const params = new URLSearchParams();
|
||||
params.append('text', inputText);
|
||||
params.append('id', speaker_id);
|
||||
if (streaming) {
|
||||
params.append('streaming', streaming);
|
||||
// Streaming response only supports MP3
|
||||
}
|
||||
else {
|
||||
params.append('format', this.settings.format);
|
||||
}
|
||||
params.append('lang', lang ?? this.settings.lang);
|
||||
params.append('length', this.settings.length);
|
||||
params.append('noise', this.settings.noise);
|
||||
params.append('noisew', this.settings.noisew);
|
||||
params.append('segment_size', this.settings.segment_size);
|
||||
|
||||
if (model_type == this.modelTypes.W2V2_VITS) {
|
||||
params.append('emotion', this.settings.dim_emotion);
|
||||
}
|
||||
else if (model_type == this.modelTypes.BERT_VITS2) {
|
||||
params.append('sdp_ratio', this.settings.sdp_ratio);
|
||||
params.append('emotion', this.settings.emotion);
|
||||
if (this.settings.text_prompt) {
|
||||
params.append('text_prompt', this.settings.text_prompt);
|
||||
}
|
||||
if (this.settings.style_text) {
|
||||
params.append('style_text', this.settings.style_text);
|
||||
params.append('style_weight', this.settings.style_weight);
|
||||
}
|
||||
}
|
||||
|
||||
const url = `${this.settings.provider_endpoint}/voice/${model_type.toLowerCase()}`;
|
||||
|
||||
if (streaming) {
|
||||
return url + `?${params.toString()}`;
|
||||
}
|
||||
|
||||
const response = await fetch(
|
||||
url,
|
||||
{
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
},
|
||||
body: params,
|
||||
},
|
||||
);
|
||||
if (!response.ok) {
|
||||
toastr.error(response.statusText, 'TTS Generation Failed');
|
||||
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
|
||||
}
|
||||
return response;
|
||||
}
|
||||
|
||||
/**
|
||||
* Preview TTS for a given voice ID.
|
||||
* @param {string} id Voice ID
|
||||
*/
|
||||
async previewTtsVoice(id) {
|
||||
this.audioElement.pause();
|
||||
this.audioElement.currentTime = 0;
|
||||
const voice = await this.getVoiceById(id);
|
||||
const lang = voice.lang.includes(this.settings.lang) ? this.settings.lang : voice.lang[0];
|
||||
|
||||
let lang_code = this.langKey2LangCode[lang];
|
||||
const text = getPreviewString(lang_code);
|
||||
const response = await this.fetchTtsGeneration(text, id, lang, true);
|
||||
if (typeof response != 'string') {
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
|
||||
}
|
||||
const audio = await response.blob();
|
||||
const url = URL.createObjectURL(audio);
|
||||
this.audioElement.src = url;
|
||||
this.audioElement.play();
|
||||
}
|
||||
}
|
||||
|
||||
// Interface not used
|
||||
async fetchTtsFromHistory(history_item_id) {
|
||||
return Promise.resolve(history_item_id);
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user