mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-06-05 21:59:27 +02:00
Merge branch 'staging' into X-T-E-R/release
This commit is contained in:
@@ -433,8 +433,8 @@ class AllTalkTtsProvider {
|
||||
updateLanguageDropdown() {
|
||||
const languageSelect = document.getElementById('language_options');
|
||||
if (languageSelect) {
|
||||
// Ensure default language is set
|
||||
this.settings.language = this.settings.language;
|
||||
// Ensure default language is set (??? whatever that means)
|
||||
// this.settings.language = this.settings.language;
|
||||
|
||||
languageSelect.innerHTML = '';
|
||||
for (let language in this.languageLabels) {
|
||||
|
@@ -6,6 +6,11 @@ import { saveTtsProviderSettings } from './index.js';
|
||||
|
||||
export { EdgeTtsProvider };
|
||||
|
||||
/**
 * Backends that can serve Edge TTS requests.
 * The values are persisted in the provider settings and used as `<option>` values,
 * so the object is frozen to keep the identifiers from being changed at runtime.
 */
const EDGE_TTS_PROVIDER = Object.freeze({
    extras: 'extras',
    plugin: 'plugin',
});
|
||||
|
||||
class EdgeTtsProvider {
|
||||
//########//
|
||||
// Config //
|
||||
@@ -19,18 +24,26 @@ class EdgeTtsProvider {
|
||||
defaultSettings = {
|
||||
voiceMap: {},
|
||||
rate: 0,
|
||||
provider: EDGE_TTS_PROVIDER.extras,
|
||||
};
|
||||
|
||||
get settingsHtml() {
|
||||
let html = `Microsoft Edge TTS Provider<br>
|
||||
let html = `Microsoft Edge TTS<br>
|
||||
<label for="edge_tts_provider">Provider</label>
|
||||
<select id="edge_tts_provider">
|
||||
<option value="${EDGE_TTS_PROVIDER.extras}">Extras</option>
|
||||
<option value="${EDGE_TTS_PROVIDER.plugin}">Plugin</option>
|
||||
</select>
|
||||
<label for="edge_tts_rate">Rate: <span id="edge_tts_rate_output"></span></label>
|
||||
<input id="edge_tts_rate" type="range" value="${this.defaultSettings.rate}" min="-100" max="100" step="1" />`;
|
||||
<input id="edge_tts_rate" type="range" value="${this.defaultSettings.rate}" min="-100" max="100" step="1" />
|
||||
`;
|
||||
return html;
|
||||
}
|
||||
|
||||
onSettingsChange() {
|
||||
this.settings.rate = Number($('#edge_tts_rate').val());
|
||||
$('#edge_tts_rate_output').text(this.settings.rate);
|
||||
this.settings.provider = String($('#edge_tts_provider').val());
|
||||
saveTtsProviderSettings();
|
||||
}
|
||||
|
||||
@@ -53,16 +66,19 @@ class EdgeTtsProvider {
|
||||
|
||||
$('#edge_tts_rate').val(this.settings.rate || 0);
|
||||
$('#edge_tts_rate_output').text(this.settings.rate || 0);
|
||||
$('#edge_tts_rate').on('input', () => {this.onSettingsChange();});
|
||||
$('#edge_tts_rate').on('input', () => { this.onSettingsChange(); });
|
||||
$('#edge_tts_provider').val(this.settings.provider || EDGE_TTS_PROVIDER.extras);
|
||||
$('#edge_tts_provider').on('change', () => { this.onSettingsChange(); });
|
||||
await this.checkReady();
|
||||
|
||||
console.debug('EdgeTTS: Settings loaded');
|
||||
}
|
||||
|
||||
|
||||
// Perform a simple readiness check by trying to fetch voiceIds
|
||||
async checkReady(){
|
||||
throwIfModuleMissing();
|
||||
/**
|
||||
* Perform a simple readiness check by trying to fetch voiceIds
|
||||
*/
|
||||
async checkReady() {
|
||||
await this.throwIfModuleMissing();
|
||||
await this.fetchTtsVoiceObjects();
|
||||
}
|
||||
|
||||
@@ -74,6 +90,11 @@ class EdgeTtsProvider {
|
||||
// TTS Interfaces //
|
||||
//#################//
|
||||
|
||||
/**
|
||||
* Get a voice from the TTS provider.
|
||||
* @param {string} voiceName Voice name to get
|
||||
* @returns {Promise<Object>} Voice object
|
||||
*/
|
||||
async getVoice(voiceName) {
|
||||
if (this.voices.length == 0) {
|
||||
this.voices = await this.fetchTtsVoiceObjects();
|
||||
@@ -87,6 +108,12 @@ class EdgeTtsProvider {
|
||||
return match;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate TTS for a given text.
|
||||
* @param {string} text Text to generate TTS for
|
||||
* @param {string} voiceId Voice ID to use
|
||||
* @returns {Promise<Response>} Fetch response
|
||||
*/
|
||||
async generateTts(text, voiceId) {
|
||||
const response = await this.fetchTtsGeneration(text, voiceId);
|
||||
return response;
|
||||
@@ -96,11 +123,10 @@ class EdgeTtsProvider {
|
||||
// API CALLS //
|
||||
//###########//
|
||||
async fetchTtsVoiceObjects() {
|
||||
throwIfModuleMissing();
|
||||
await this.throwIfModuleMissing();
|
||||
|
||||
const url = new URL(getApiUrl());
|
||||
url.pathname = '/api/edge-tts/list';
|
||||
const response = await doExtrasFetch(url);
|
||||
const url = this.getVoicesUrl();
|
||||
const response = await this.doFetch(url);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
|
||||
}
|
||||
@@ -111,7 +137,10 @@ class EdgeTtsProvider {
|
||||
return responseJson;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Preview TTS for a given voice ID.
|
||||
* @param {string} id Voice ID
|
||||
*/
|
||||
async previewTtsVoice(id) {
|
||||
this.audioElement.pause();
|
||||
this.audioElement.currentTime = 0;
|
||||
@@ -128,13 +157,18 @@ class EdgeTtsProvider {
|
||||
this.audioElement.play();
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch TTS generation from the API.
|
||||
* @param {string} inputText Text to generate TTS for
|
||||
* @param {string} voiceId Voice ID to use
|
||||
* @returns {Promise<Response>} Fetch response
|
||||
*/
|
||||
async fetchTtsGeneration(inputText, voiceId) {
|
||||
throwIfModuleMissing();
|
||||
await this.throwIfModuleMissing();
|
||||
|
||||
console.info(`Generating new TTS for voice_id ${voiceId}`);
|
||||
const url = new URL(getApiUrl());
|
||||
url.pathname = '/api/edge-tts/generate';
|
||||
const response = await doExtrasFetch(url,
|
||||
const url = this.getGenerateUrl();
|
||||
const response = await this.doFetch(url,
|
||||
{
|
||||
method: 'POST',
|
||||
headers: getRequestHeaders(),
|
||||
@@ -151,12 +185,85 @@ class EdgeTtsProvider {
|
||||
}
|
||||
return response;
|
||||
}
|
||||
}
|
||||
function throwIfModuleMissing() {
|
||||
if (!modules.includes('edge-tts')) {
|
||||
const message = 'Edge TTS module not loaded. Add edge-tts to enable-modules and restart the Extras API.';
|
||||
// toastr.error(message)
|
||||
throw new Error(message);
|
||||
|
||||
/**
|
||||
* Perform a fetch request using the configured provider.
|
||||
* @param {string} url URL string
|
||||
* @param {any} options Request options
|
||||
* @returns {Promise<Response>} Fetch response
|
||||
*/
|
||||
doFetch(url, options) {
|
||||
if (this.settings.provider === EDGE_TTS_PROVIDER.extras) {
|
||||
return doExtrasFetch(url, options);
|
||||
}
|
||||
|
||||
if (this.settings.provider === EDGE_TTS_PROVIDER.plugin) {
|
||||
return fetch(url, options);
|
||||
}
|
||||
|
||||
throw new Error('Invalid TTS Provider');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the URL for the TTS generation endpoint.
|
||||
* @returns {string} URL string
|
||||
*/
|
||||
getGenerateUrl() {
|
||||
if (this.settings.provider === EDGE_TTS_PROVIDER.extras) {
|
||||
const url = new URL(getApiUrl());
|
||||
url.pathname = '/api/edge-tts/generate';
|
||||
return url.toString();
|
||||
}
|
||||
|
||||
if (this.settings.provider === EDGE_TTS_PROVIDER.plugin) {
|
||||
return '/api/plugins/edge-tts/generate';
|
||||
}
|
||||
|
||||
throw new Error('Invalid TTS Provider');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the URL for the TTS voices endpoint.
|
||||
* @returns {string} URL object or string
|
||||
*/
|
||||
getVoicesUrl() {
|
||||
if (this.settings.provider === EDGE_TTS_PROVIDER.extras) {
|
||||
const url = new URL(getApiUrl());
|
||||
url.pathname = '/api/edge-tts/list';
|
||||
return url.toString();
|
||||
}
|
||||
|
||||
if (this.settings.provider === EDGE_TTS_PROVIDER.plugin) {
|
||||
return '/api/plugins/edge-tts/list';
|
||||
}
|
||||
|
||||
throw new Error('Invalid TTS Provider');
|
||||
}
|
||||
|
||||
async throwIfModuleMissing() {
|
||||
if (this.settings.provider === EDGE_TTS_PROVIDER.extras && !modules.includes('edge-tts')) {
|
||||
const message = 'Edge TTS module not loaded. Add edge-tts to enable-modules and restart the Extras API.';
|
||||
// toastr.error(message)
|
||||
throw new Error(message);
|
||||
}
|
||||
|
||||
if (this.settings.provider === EDGE_TTS_PROVIDER.plugin && !this.isPluginAvailable()) {
|
||||
const message = 'Edge TTS Server plugin not loaded. Install it from https://github.com/SillyTavern/SillyTavern-EdgeTTS-Plugin and restart the SillyTavern server.';
|
||||
// toastr.error(message)
|
||||
throw new Error(message);
|
||||
}
|
||||
}
|
||||
|
||||
async isPluginAvailable() {
|
||||
try {
|
||||
const result = await fetch('/api/plugins/edge-tts/probe', {
|
||||
method: 'POST',
|
||||
headers: getRequestHeaders(),
|
||||
});
|
||||
return result.ok;
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -14,6 +14,8 @@ class ElevenLabsTtsProvider {
|
||||
defaultSettings = {
|
||||
stability: 0.75,
|
||||
similarity_boost: 0.75,
|
||||
style_exaggeration: 0.00,
|
||||
speaker_boost: true,
|
||||
apiKey: '',
|
||||
model: 'eleven_monolingual_v1',
|
||||
voiceMap: {},
|
||||
@@ -26,27 +28,57 @@ class ElevenLabsTtsProvider {
|
||||
<input id="elevenlabs_tts_api_key" type="text" class="text_pole" placeholder="<API Key>"/>
|
||||
<label for="elevenlabs_tts_model">Model</label>
|
||||
<select id="elevenlabs_tts_model" class="text_pole">
|
||||
<option value="eleven_monolingual_v1">Monolingual</option>
|
||||
<option value="eleven_monolingual_v1">English v1</option>
|
||||
<option value="eleven_multilingual_v1">Multilingual v1</option>
|
||||
<option value="eleven_multilingual_v2">Multilingual v2</option>
|
||||
<option value="eleven_turbo_v2">Turbo v2</option>
|
||||
</select>
|
||||
<input id="eleven_labs_connect" class="menu_button" type="button" value="Connect" />
|
||||
<label for="elevenlabs_tts_stability">Stability: <span id="elevenlabs_tts_stability_output"></span></label>
|
||||
<input id="elevenlabs_tts_stability" type="range" value="${this.defaultSettings.stability}" min="0" max="1" step="0.05" />
|
||||
<input id="elevenlabs_tts_stability" type="range" value="${this.defaultSettings.stability}" min="0" max="1" step="0.01" />
|
||||
<label for="elevenlabs_tts_similarity_boost">Similarity Boost: <span id="elevenlabs_tts_similarity_boost_output"></span></label>
|
||||
<input id="elevenlabs_tts_similarity_boost" type="range" value="${this.defaultSettings.similarity_boost}" min="0" max="1" step="0.05" />
|
||||
<input id="elevenlabs_tts_similarity_boost" type="range" value="${this.defaultSettings.similarity_boost}" min="0" max="1" step="0.01" />
|
||||
<div id="elevenlabs_tts_v2_options" style="display: none;">
|
||||
<label for="elevenlabs_tts_style_exaggeration">Style Exaggeration: <span id="elevenlabs_tts_style_exaggeration_output"></span></label>
|
||||
<input id="elevenlabs_tts_style_exaggeration" type="range" value="${this.defaultSettings.style_exaggeration}" min="0" max="1" step="0.01" />
|
||||
<label for="elevenlabs_tts_speaker_boost">Speaker Boost:</label>
|
||||
<input id="elevenlabs_tts_speaker_boost" style="display: inline-grid" type="checkbox" />
|
||||
</div>
|
||||
<hr>
|
||||
<div id="elevenlabs_tts_voice_cloning">
|
||||
<span>Instant Voice Cloning</span><br>
|
||||
<input id="elevenlabs_tts_voice_cloning_name" type="text" class="text_pole" placeholder="Voice Name"/>
|
||||
<input id="elevenlabs_tts_voice_cloning_description" type="text" class="text_pole" placeholder="Voice Description"/>
|
||||
<input id="elevenlabs_tts_voice_cloning_labels" type="text" class="text_pole" placeholder="Labels"/>
|
||||
<div class="menu_button menu_button_icon" id="upload_audio_file">
|
||||
<i class="fa-solid fa-file-import"></i>
|
||||
<span>Upload Audio Files</span>
|
||||
</div>
|
||||
<input id="elevenlabs_tts_audio_files" type="file" name="audio_files" accept="audio/*" style="display: none;" multiple>
|
||||
<div id="elevenlabs_tts_selected_files_list"></div>
|
||||
<input id="elevenlabs_tts_clone_voice_button" class="menu_button menu_button_icon" type="button" value="Clone Voice">
|
||||
</div>
|
||||
<hr>
|
||||
</div>
|
||||
`;
|
||||
return html;
|
||||
}
|
||||
|
||||
shouldInvolveExtendedSettings() {
|
||||
return this.settings.model === 'eleven_multilingual_v2';
|
||||
}
|
||||
|
||||
onSettingsChange() {
|
||||
// Update dynamically
|
||||
this.settings.stability = $('#elevenlabs_tts_stability').val();
|
||||
this.settings.similarity_boost = $('#elevenlabs_tts_similarity_boost').val();
|
||||
this.settings.style_exaggeration = $('#elevenlabs_tts_style_exaggeration').val();
|
||||
this.settings.speaker_boost = $('#elevenlabs_tts_speaker_boost').is(':checked');
|
||||
this.settings.model = $('#elevenlabs_tts_model').find(':selected').val();
|
||||
$('#elevenlabs_tts_stability_output').text(this.settings.stability);
|
||||
$('#elevenlabs_tts_similarity_boost_output').text(this.settings.similarity_boost);
|
||||
$('#elevenlabs_tts_stability_output').text(this.settings.stability * 100 + '%');
|
||||
$('#elevenlabs_tts_similarity_boost_output').text(this.settings.similarity_boost * 100 + '%');
|
||||
$('#elevenlabs_tts_style_exaggeration_output').text(this.settings.style_exaggeration * 100 + '%');
|
||||
$('#elevenlabs_tts_v2_options').toggle(this.shouldInvolveExtendedSettings());
|
||||
saveTtsProviderSettings();
|
||||
}
|
||||
|
||||
@@ -75,21 +107,28 @@ class ElevenLabsTtsProvider {
|
||||
|
||||
$('#elevenlabs_tts_stability').val(this.settings.stability);
|
||||
$('#elevenlabs_tts_similarity_boost').val(this.settings.similarity_boost);
|
||||
$('#elevenlabs_tts_style_exaggeration').val(this.settings.style_exaggeration);
|
||||
$('#elevenlabs_tts_speaker_boost').prop('checked', this.settings.speaker_boost);
|
||||
$('#elevenlabs_tts_api_key').val(this.settings.apiKey);
|
||||
$('#elevenlabs_tts_model').val(this.settings.model);
|
||||
$('#eleven_labs_connect').on('click', () => { this.onConnectClick(); });
|
||||
$('#elevenlabs_tts_similarity_boost').on('input', this.onSettingsChange.bind(this));
|
||||
$('#elevenlabs_tts_stability').on('input', this.onSettingsChange.bind(this));
|
||||
$('#elevenlabs_tts_style_exaggeration').on('input', this.onSettingsChange.bind(this));
|
||||
$('#elevenlabs_tts_speaker_boost').on('change', this.onSettingsChange.bind(this));
|
||||
$('#elevenlabs_tts_model').on('change', this.onSettingsChange.bind(this));
|
||||
$('#elevenlabs_tts_stability_output').text(this.settings.stability);
|
||||
$('#elevenlabs_tts_similarity_boost_output').text(this.settings.similarity_boost);
|
||||
|
||||
$('#elevenlabs_tts_style_exaggeration_output').text(this.settings.style_exaggeration);
|
||||
$('#elevenlabs_tts_v2_options').toggle(this.shouldInvolveExtendedSettings());
|
||||
try {
|
||||
await this.checkReady();
|
||||
console.debug('ElevenLabs: Settings loaded');
|
||||
} catch {
|
||||
console.debug('ElevenLabs: Settings loaded, but not ready');
|
||||
}
|
||||
|
||||
this.setupVoiceCloningMenu();
|
||||
}
|
||||
|
||||
// Perform a simple readiness check by trying to fetch voiceIds
|
||||
@@ -107,6 +146,63 @@ class ElevenLabsTtsProvider {
|
||||
});
|
||||
}
|
||||
|
||||
setupVoiceCloningMenu() {
|
||||
const audioFilesInput = document.getElementById('elevenlabs_tts_audio_files');
|
||||
const selectedFilesListElement = document.getElementById('elevenlabs_tts_selected_files_list');
|
||||
const cloneVoiceButton = document.getElementById('elevenlabs_tts_clone_voice_button');
|
||||
const uploadAudioFileButton = document.getElementById('upload_audio_file');
|
||||
const voiceCloningNameInput = document.getElementById('elevenlabs_tts_voice_cloning_name');
|
||||
const voiceCloningDescriptionInput = document.getElementById('elevenlabs_tts_voice_cloning_description');
|
||||
const voiceCloningLabelsInput = document.getElementById('elevenlabs_tts_voice_cloning_labels');
|
||||
|
||||
const updateCloneVoiceButtonVisibility = () => {
|
||||
cloneVoiceButton.style.display = audioFilesInput.files.length > 0 ? 'inline-block' : 'none';
|
||||
};
|
||||
|
||||
const clearSelectedFiles = () => {
|
||||
audioFilesInput.value = '';
|
||||
selectedFilesListElement.innerHTML = '';
|
||||
updateCloneVoiceButtonVisibility();
|
||||
};
|
||||
|
||||
uploadAudioFileButton.addEventListener('click', () => {
|
||||
audioFilesInput.click();
|
||||
});
|
||||
|
||||
audioFilesInput.addEventListener('change', () => {
|
||||
selectedFilesListElement.innerHTML = '';
|
||||
for (const file of audioFilesInput.files) {
|
||||
const listItem = document.createElement('div');
|
||||
listItem.textContent = file.name;
|
||||
selectedFilesListElement.appendChild(listItem);
|
||||
}
|
||||
updateCloneVoiceButtonVisibility();
|
||||
});
|
||||
|
||||
cloneVoiceButton.addEventListener('click', async () => {
|
||||
const voiceName = voiceCloningNameInput.value.trim();
|
||||
const voiceDescription = voiceCloningDescriptionInput.value.trim();
|
||||
const voiceLabels = voiceCloningLabelsInput.value.trim();
|
||||
|
||||
if (!voiceName) {
|
||||
toastr.error('Please provide a name for the cloned voice.');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
await this.addVoice(voiceName, voiceDescription, voiceLabels);
|
||||
toastr.success('Voice cloned successfully. Hit reload to see the new voice in the voice listing.');
|
||||
clearSelectedFiles();
|
||||
voiceCloningNameInput.value = '';
|
||||
voiceCloningDescriptionInput.value = '';
|
||||
voiceCloningLabelsInput.value = '';
|
||||
} catch (error) {
|
||||
toastr.error(`Failed to clone voice: ${error.message}`);
|
||||
}
|
||||
});
|
||||
|
||||
updateCloneVoiceButtonVisibility();
|
||||
}
|
||||
|
||||
async updateApiKey() {
|
||||
// Using this call to validate API key
|
||||
@@ -206,24 +302,26 @@ class ElevenLabsTtsProvider {
|
||||
async fetchTtsGeneration(text, voiceId) {
|
||||
let model = this.settings.model ?? 'eleven_monolingual_v1';
|
||||
console.info(`Generating new TTS for voice_id ${voiceId}, model ${model}`);
|
||||
const response = await fetch(
|
||||
`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
|
||||
{
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'xi-api-key': this.settings.apiKey,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model_id: model,
|
||||
text: text,
|
||||
voice_settings: {
|
||||
stability: Number(this.settings.stability),
|
||||
similarity_boost: Number(this.settings.similarity_boost),
|
||||
},
|
||||
}),
|
||||
const request = {
|
||||
model_id: model,
|
||||
text: text,
|
||||
voice_settings: {
|
||||
stability: Number(this.settings.stability),
|
||||
similarity_boost: Number(this.settings.similarity_boost),
|
||||
},
|
||||
);
|
||||
};
|
||||
if (this.shouldInvolveExtendedSettings()) {
|
||||
request.voice_settings.style_exaggeration = Number(this.settings.style_exaggeration);
|
||||
request.voice_settings.speaker_boost = Boolean(this.settings.speaker_boost);
|
||||
}
|
||||
const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'xi-api-key': this.settings.apiKey,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify(request),
|
||||
});
|
||||
if (!response.ok) {
|
||||
toastr.error(response.statusText, 'TTS Generation Failed');
|
||||
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
|
||||
@@ -260,4 +358,33 @@ class ElevenLabsTtsProvider {
|
||||
const responseJson = await response.json();
|
||||
return responseJson.history;
|
||||
}
|
||||
|
||||
async addVoice(name, description, labels) {
|
||||
const selected_files = document.querySelectorAll('input[type="file"][name="audio_files"]');
|
||||
const formData = new FormData();
|
||||
|
||||
formData.append('name', name);
|
||||
formData.append('description', description);
|
||||
formData.append('labels', labels);
|
||||
|
||||
for (const file of selected_files) {
|
||||
if (file.files.length > 0) {
|
||||
formData.append('files', file.files[0]);
|
||||
}
|
||||
}
|
||||
|
||||
const response = await fetch('https://api.elevenlabs.io/v1/voices/add', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'xi-api-key': this.settings.apiKey,
|
||||
},
|
||||
body: formData,
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
|
||||
}
|
||||
|
||||
return await response.json();
|
||||
}
|
||||
}
|
||||
|
@@ -1,4 +1,4 @@
|
||||
import { callPopup, cancelTtsPlay, eventSource, event_types, name2, saveSettingsDebounced } from '../../../script.js';
|
||||
import { callPopup, cancelTtsPlay, eventSource, event_types, name2, saveSettingsDebounced, substituteParams } from '../../../script.js';
|
||||
import { ModuleWorkerWrapper, doExtrasFetch, extension_settings, getApiUrl, getContext, modules } from '../../extensions.js';
|
||||
import { delay, escapeRegex, getBase64Async, getStringHash, onlyUnique } from '../../utils.js';
|
||||
import { EdgeTtsProvider } from './edge.js';
|
||||
@@ -8,20 +8,23 @@ import { CoquiTtsProvider } from './coqui.js';
|
||||
import { SystemTtsProvider } from './system.js';
|
||||
import { NovelTtsProvider } from './novel.js';
|
||||
import { power_user } from '../../power-user.js';
|
||||
import { registerSlashCommand } from '../../slash-commands.js';
|
||||
import { OpenAITtsProvider } from './openai.js';
|
||||
import { XTTSTtsProvider } from './xtts.js';
|
||||
import { GSVITtsProvider } from './gsvi.js';
|
||||
import { AllTalkTtsProvider } from './alltalk.js';
|
||||
import { SpeechT5TtsProvider } from './speecht5.js';
|
||||
import { SlashCommandParser } from '../../slash-commands/SlashCommandParser.js';
|
||||
import { SlashCommand } from '../../slash-commands/SlashCommand.js';
|
||||
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from '../../slash-commands/SlashCommandArgument.js';
|
||||
export { talkingAnimation };
|
||||
|
||||
const UPDATE_INTERVAL = 1000;
|
||||
|
||||
let voiceMapEntries = [];
|
||||
let voiceMap = {}; // {charName:voiceid, charName2:voiceid2}
|
||||
let storedvalue = false;
|
||||
let talkingHeadState = false;
|
||||
let lastChatId = null;
|
||||
let lastMessage = null;
|
||||
let lastMessageHash = null;
|
||||
|
||||
const DEFAULT_VOICE_MARKER = '[Default Voice]';
|
||||
@@ -68,7 +71,7 @@ export function getPreviewString(lang) {
|
||||
return previewStrings[lang] ?? fallbackPreview;
|
||||
}
|
||||
|
||||
let ttsProviders = {
|
||||
const ttsProviders = {
|
||||
ElevenLabs: ElevenLabsTtsProvider,
|
||||
Silero: SileroTtsProvider,
|
||||
XTTSv2: XTTSTtsProvider,
|
||||
@@ -84,7 +87,6 @@ let ttsProviders = {
|
||||
let ttsProvider;
|
||||
let ttsProviderName;
|
||||
|
||||
let ttsLastMessage = null;
|
||||
|
||||
async function onNarrateOneMessage() {
|
||||
audioElement.src = '/sounds/silence.mp3';
|
||||
@@ -132,103 +134,13 @@ async function onNarrateText(args, text) {
|
||||
}
|
||||
|
||||
async function moduleWorker() {
|
||||
// Primarily determining when to add new chat to the TTS queue
|
||||
const enabled = $('#tts_enabled').is(':checked');
|
||||
$('body').toggleClass('tts', enabled);
|
||||
if (!enabled) {
|
||||
if (!extension_settings.tts.enabled) {
|
||||
return;
|
||||
}
|
||||
|
||||
const context = getContext();
|
||||
const chat = context.chat;
|
||||
|
||||
processTtsQueue();
|
||||
processAudioJobQueue();
|
||||
updateUiAudioPlayState();
|
||||
|
||||
// Auto generation is disabled
|
||||
if (extension_settings.tts.auto_generation == false) {
|
||||
return;
|
||||
}
|
||||
|
||||
// no characters or group selected
|
||||
if (!context.groupId && context.characterId === undefined) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Chat changed
|
||||
if (
|
||||
context.chatId !== lastChatId
|
||||
) {
|
||||
currentMessageNumber = context.chat.length ? context.chat.length : 0;
|
||||
saveLastValues();
|
||||
|
||||
// Force to speak on the first message in the new chat
|
||||
if (context.chat.length === 1) {
|
||||
lastMessageHash = -1;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// take the count of messages
|
||||
let lastMessageNumber = context.chat.length ? context.chat.length : 0;
|
||||
|
||||
// There's no new messages
|
||||
let diff = lastMessageNumber - currentMessageNumber;
|
||||
let hashNew = getStringHash((chat.length && chat[chat.length - 1].mes) ?? '');
|
||||
|
||||
// if messages got deleted, diff will be < 0
|
||||
if (diff < 0) {
|
||||
// necessary actions will be taken by the onChatDeleted() handler
|
||||
return;
|
||||
}
|
||||
|
||||
// if no new messages, or same message, or same message hash, do nothing
|
||||
if (diff == 0 && hashNew === lastMessageHash) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If streaming, wait for streaming to finish before processing new messages
|
||||
if (context.streamingProcessor && !context.streamingProcessor.isFinished) {
|
||||
return;
|
||||
}
|
||||
|
||||
// clone message object, as things go haywire if message object is altered below (it's passed by reference)
|
||||
const message = structuredClone(chat[chat.length - 1]);
|
||||
|
||||
// if last message within current message, message got extended. only send diff to TTS.
|
||||
if (ttsLastMessage !== null && message.mes.indexOf(ttsLastMessage) !== -1) {
|
||||
let tmp = message.mes;
|
||||
message.mes = message.mes.replace(ttsLastMessage, '');
|
||||
ttsLastMessage = tmp;
|
||||
} else {
|
||||
ttsLastMessage = message.mes;
|
||||
}
|
||||
|
||||
// We're currently swiping. Don't generate voice
|
||||
if (!message || message.mes === '...' || message.mes === '') {
|
||||
return;
|
||||
}
|
||||
|
||||
// Don't generate if message doesn't have a display text
|
||||
if (extension_settings.tts.narrate_translated_only && !(message?.extra?.display_text)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Don't generate if message is a user message and user message narration is disabled
|
||||
if (message.is_user && !extension_settings.tts.narrate_user) {
|
||||
return;
|
||||
}
|
||||
|
||||
// New messages, add new chat to history
|
||||
lastMessageHash = hashNew;
|
||||
currentMessageNumber = lastMessageNumber;
|
||||
|
||||
console.debug(
|
||||
`Adding message from ${message.name} for TTS processing: "${message.mes}"`,
|
||||
);
|
||||
ttsJobQueue.push(message);
|
||||
}
|
||||
|
||||
function talkingAnimation(switchValue) {
|
||||
@@ -240,11 +152,11 @@ function talkingAnimation(switchValue) {
|
||||
const apiUrl = getApiUrl();
|
||||
const animationType = switchValue ? 'start' : 'stop';
|
||||
|
||||
if (switchValue !== storedvalue) {
|
||||
if (switchValue !== talkingHeadState) {
|
||||
try {
|
||||
console.log(animationType + ' Talking Animation');
|
||||
doExtrasFetch(`${apiUrl}/api/talkinghead/${animationType}_talking`);
|
||||
storedvalue = switchValue; // Update the storedvalue to the current switchValue
|
||||
talkingHeadState = switchValue;
|
||||
} catch (error) {
|
||||
// Handle the error here or simply ignore it to prevent logging
|
||||
}
|
||||
@@ -291,7 +203,6 @@ function debugTtsPlayback() {
|
||||
{
|
||||
'ttsProviderName': ttsProviderName,
|
||||
'voiceMap': voiceMap,
|
||||
'currentMessageNumber': currentMessageNumber,
|
||||
'audioPaused': audioPaused,
|
||||
'audioJobQueue': audioJobQueue,
|
||||
'currentAudioJob': currentAudioJob,
|
||||
@@ -352,6 +263,7 @@ async function playAudioData(audioJob) {
|
||||
audioElement.addEventListener('ended', completeCurrentAudioJob);
|
||||
audioElement.addEventListener('canplay', () => {
|
||||
console.debug('Starting TTS playback');
|
||||
audioElement.playbackRate = extension_settings.tts.playback_rate;
|
||||
audioElement.play();
|
||||
});
|
||||
}
|
||||
@@ -467,6 +379,7 @@ async function processAudioJobQueue() {
|
||||
playAudioData(currentAudioJob);
|
||||
talkingAnimation(true);
|
||||
} catch (error) {
|
||||
toastr.error(error.toString());
|
||||
console.error(error);
|
||||
audioQueueProcessorReady = true;
|
||||
}
|
||||
@@ -478,21 +391,12 @@ async function processAudioJobQueue() {
|
||||
|
||||
let ttsJobQueue = [];
|
||||
let currentTtsJob; // Null if nothing is currently being processed
|
||||
let currentMessageNumber = 0;
|
||||
|
||||
/**
 * Log that the job currently being processed has finished and clear it.
 */
function completeTtsJob() {
    const finishedJobName = currentTtsJob?.name;
    console.info(`Current TTS job for ${finishedJobName} completed.`);
    currentTtsJob = null;
}
|
||||
|
||||
/**
 * Remember the current chat id and the hash of its last message text.
 * An empty chat, or a last message without text, is hashed as the empty string.
 */
function saveLastValues() {
    const context = getContext();
    const lastEntry = context.chat[context.chat.length - 1];

    lastChatId = context.chatId;
    // `(length && mes) ?? ''` produced the number 0 for an empty chat because `??`
    // only replaces null/undefined; optional chaining always yields a string here.
    lastMessageHash = getStringHash(lastEntry?.mes ?? '');
}
|
||||
|
||||
async function tts(text, voiceId, char) {
|
||||
async function processResponse(response) {
|
||||
// RVC injection
|
||||
@@ -526,11 +430,18 @@ async function processTtsQueue() {
|
||||
currentTtsJob = ttsJobQueue.shift();
|
||||
let text = extension_settings.tts.narrate_translated_only ? (currentTtsJob?.extra?.display_text || currentTtsJob.mes) : currentTtsJob.mes;
|
||||
|
||||
// Substitute macros
|
||||
text = substituteParams(text);
|
||||
|
||||
if (extension_settings.tts.skip_codeblocks) {
|
||||
text = text.replace(/^\s{4}.*$/gm, '').trim();
|
||||
text = text.replace(/```.*?```/gs, '').trim();
|
||||
}
|
||||
|
||||
if (extension_settings.tts.skip_tags) {
|
||||
text = text.replace(/<.*?>.*?<\/.*?>/g, '').trim();
|
||||
}
|
||||
|
||||
if (!extension_settings.tts.pass_asterisks) {
|
||||
text = extension_settings.tts.narrate_dialogues_only
|
||||
? text.replace(/\*[^*]*?(\*|$)/g, '').trim() // remove asterisks content
|
||||
@@ -579,8 +490,9 @@ async function processTtsQueue() {
|
||||
toastr.error(`Specified voice for ${char} was not found. Check the TTS extension settings.`);
|
||||
throw `Unable to attain voiceId for ${char}`;
|
||||
}
|
||||
tts(text, voiceId, char);
|
||||
await tts(text, voiceId, char);
|
||||
} catch (error) {
|
||||
toastr.error(error.toString());
|
||||
console.error(error);
|
||||
currentTtsJob = null;
|
||||
}
|
||||
@@ -618,6 +530,12 @@ function loadSettings() {
|
||||
$('#tts_narrate_translated_only').prop('checked', extension_settings.tts.narrate_translated_only);
|
||||
$('#tts_narrate_user').prop('checked', extension_settings.tts.narrate_user);
|
||||
$('#tts_pass_asterisks').prop('checked', extension_settings.tts.pass_asterisks);
|
||||
$('#tts_skip_codeblocks').prop('checked', extension_settings.tts.skip_codeblocks);
|
||||
$('#tts_skip_tags').prop('checked', extension_settings.tts.skip_tags);
|
||||
$('#playback_rate').val(extension_settings.tts.playback_rate);
|
||||
$('#playback_rate_counter').val(Number(extension_settings.tts.playback_rate).toFixed(2));
|
||||
$('#playback_rate_block').toggle(extension_settings.tts.currentProvider !== 'System');
|
||||
|
||||
$('body').toggleClass('tts', extension_settings.tts.enabled);
|
||||
}
|
||||
|
||||
@@ -627,6 +545,7 @@ const defaultSettings = {
|
||||
currentProvider: 'ElevenLabs',
|
||||
auto_generation: true,
|
||||
narrate_user: false,
|
||||
playback_rate: 1,
|
||||
};
|
||||
|
||||
function setTtsStatus(status, success) {
|
||||
@@ -650,6 +569,7 @@ function onRefreshClick() {
|
||||
initVoiceMap();
|
||||
updateVoiceMap();
|
||||
}).catch(error => {
|
||||
toastr.error(error.toString());
|
||||
console.error(error);
|
||||
setTtsStatus(error, false);
|
||||
});
|
||||
@@ -696,6 +616,11 @@ function onSkipCodeblocksClick() {
|
||||
saveSettingsDebounced();
|
||||
}
|
||||
|
||||
/**
 * Store the state of the "skip tags" checkbox in the TTS settings and schedule a save.
 */
function onSkipTagsClick() {
    const isChecked = Boolean($('#tts_skip_tags').prop('checked'));
    extension_settings.tts.skip_tags = isChecked;
    saveSettingsDebounced();
}
|
||||
|
||||
function onPassAsterisksClick() {
|
||||
extension_settings.tts.pass_asterisks = !!$('#tts_pass_asterisks').prop('checked');
|
||||
saveSettingsDebounced();
|
||||
@@ -732,6 +657,7 @@ async function loadTtsProvider(provider) {
|
||||
/**
 * React to a new selection in the provider dropdown: remember it, hide the
 * playback-rate control for the 'System' provider, and load the chosen provider.
 */
function onTtsProviderChange() {
    const selectedProvider = $('#tts_provider').val();
    const ttsSettings = extension_settings.tts;

    ttsSettings.currentProvider = selectedProvider;
    $('#playback_rate_block').toggle(selectedProvider !== 'System');
    loadTtsProvider(selectedProvider);
}
|
||||
|
||||
@@ -752,26 +678,103 @@ async function onChatChanged() {
|
||||
await resetTtsPlayback();
|
||||
const voiceMapInit = initVoiceMap();
|
||||
await Promise.race([voiceMapInit, delay(1000)]);
|
||||
ttsLastMessage = null;
|
||||
lastMessage = null;
|
||||
}
|
||||
|
||||
async function onChatDeleted() {
|
||||
/**
 * Queues a rendered chat message for narration when automatic generation applies.
 *
 * Nothing is queued when TTS or auto-generation is switched off, when no character or
 * group is selected, when the message text hashes to the same value as the last queued
 * one, when the message does not exist, when it is empty or the '...' placeholder, when
 * translated-only narration is on but the message has no display text, or when it is a
 * user message and user narration is off.
 *
 * If the message merely extends the previously seen message (same swipe, name and
 * author kind, and the old text is contained in the new one), only the added text is
 * queued.
 *
 * @param {number|string} messageId Key used to look the message up in `context.chat`.
 * @returns {Promise<void>}
 */
async function onMessageEvent(messageId) {
    // If TTS is disabled, do nothing
    if (!extension_settings.tts.enabled) {
        return;
    }

    // Auto generation is disabled
    if (!extension_settings.tts.auto_generation) {
        return;
    }

    const context = getContext();

    // no characters or group selected
    if (!context.groupId && context.characterId === undefined) {
        return;
    }

    // Chat changed
    if (context.chatId !== lastChatId) {
        lastChatId = context.chatId;
        lastMessageHash = getStringHash(context.chat[messageId]?.mes ?? '');

        // Force to speak on the first message in the new chat
        if (context.chat.length === 1) {
            lastMessageHash = -1;
        }
    }

    // clone message object, as things go haywire if message object is altered below (it's passed by reference)
    const message = structuredClone(context.chat[messageId]);
    const hashNew = getStringHash(message?.mes ?? '');

    // if no new messages, or same message, or same message hash, do nothing
    if (hashNew === lastMessageHash) {
        return;
    }

    // The message may not exist (structuredClone(undefined) is undefined). Bail out
    // before any property of it is read, otherwise the comparison below throws.
    if (!message) {
        return;
    }

    const extendsLastMessage = Boolean(
        lastMessage &&
        typeof lastMessage === 'object' &&
        message.swipe_id === lastMessage.swipe_id &&
        message.name === lastMessage.name &&
        message.is_user === lastMessage.is_user &&
        message.mes.includes(lastMessage.mes),
    );

    // if last message within current message, message got extended. only send diff to TTS.
    if (extendsLastMessage) {
        // Remember the full text; the queued copy keeps only the newly added part.
        const fullMessage = structuredClone(message);
        message.mes = message.mes.replace(lastMessage.mes, '');
        lastMessage = fullMessage;
    } else {
        lastMessage = structuredClone(message);
    }

    // We're currently swiping. Don't generate voice
    if (message.mes === '...' || message.mes === '') {
        return;
    }

    // Don't generate if message doesn't have a display text
    if (extension_settings.tts.narrate_translated_only && !(message?.extra?.display_text)) {
        return;
    }

    // Don't generate if message is a user message and user message narration is disabled
    if (message.is_user && !extension_settings.tts.narrate_user) {
        return;
    }

    // New messages, add new chat to history
    lastMessageHash = hashNew;
    lastChatId = context.chatId;

    console.debug(`Adding message from ${message.name} for TTS processing: "${message.mes}"`);
    ttsJobQueue.push(message);
}
|
||||
|
||||
/**
 * Re-syncs the TTS bookkeeping after a message was deleted from the chat.
 *
 * Updates the tracked chat id and message count, then compares the hash of the
 * now-last message text with the last hash seen. If they differ, the tracked hash,
 * text and message snapshot are replaced and TTS playback is reset.
 *
 * @returns {Promise<void>}
 */
async function onMessageDeleted() {
    const context = getContext();

    // update internal references to new last message
    lastChatId = context.chatId;
    currentMessageNumber = context.chat.length || 0;

    const hasMessages = Boolean(context.chat.length);
    const newestMessage = hasMessages ? context.chat[context.chat.length - 1] : undefined;
    // Fall back to an empty string explicitly: `(length && mes) ?? ''` evaluates to the
    // number 0 for an empty chat, because `??` only replaces null and undefined.
    const newestText = newestMessage?.mes ?? '';

    // compare against lastMessageHash. If it's the same, we did not delete the last chat item, so no need to reset tts queue
    const messageHash = getStringHash(newestText);
    if (messageHash === lastMessageHash) {
        return;
    }

    lastMessageHash = messageHash;
    ttsLastMessage = newestText;
    lastMessage = hasMessages ? structuredClone(newestMessage) : null;

    // stop any tts playback since message might not exist anymore
    await resetTtsPlayback();
}
|
||||
|
||||
/**
|
||||
@@ -1019,11 +1022,29 @@ $(document).ready(function () {
|
||||
<input type="checkbox" id="tts_skip_codeblocks">
|
||||
<small>Skip codeblocks</small>
|
||||
</label>
|
||||
<label class="checkbox_label" for="tts_skip_tags">
|
||||
<input type="checkbox" id="tts_skip_tags">
|
||||
<small>Skip <tagged> blocks</small>
|
||||
</label>
|
||||
<label class="checkbox_label" for="tts_pass_asterisks">
|
||||
<input type="checkbox" id="tts_pass_asterisks">
|
||||
<small>Pass Asterisks to TTS Engine</small>
|
||||
</label>
|
||||
</div>
|
||||
<div id="playback_rate_block" class="range-block">
|
||||
<hr>
|
||||
<div class="range-block-title justifyLeft" data-i18n="Audio Playback Speed">
|
||||
<small>Audio Playback Speed</small>
|
||||
</div>
|
||||
<div class="range-block-range-and-counter">
|
||||
<div class="range-block-range">
|
||||
<input type="range" id="playback_rate" name="volume" min="0" max="3" step="0.05">
|
||||
</div>
|
||||
<div class="range-block-counter">
|
||||
<input type="number" min="0" max="3" step="0.05" data-for="playback_rate" id="playback_rate_counter">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div id="tts_voicemap_block">
|
||||
</div>
|
||||
<hr>
|
||||
@@ -1044,9 +1065,19 @@ $(document).ready(function () {
|
||||
$('#tts_narrate_quoted').on('click', onNarrateQuotedClick);
|
||||
$('#tts_narrate_translated_only').on('click', onNarrateTranslatedOnlyClick);
|
||||
$('#tts_skip_codeblocks').on('click', onSkipCodeblocksClick);
|
||||
$('#tts_skip_tags').on('click', onSkipTagsClick);
|
||||
$('#tts_pass_asterisks').on('click', onPassAsterisksClick);
|
||||
$('#tts_auto_generation').on('click', onAutoGenerationClick);
|
||||
$('#tts_narrate_user').on('click', onNarrateUserClick);
|
||||
|
||||
$('#playback_rate').on('input', function () {
|
||||
const value = $(this).val();
|
||||
const formattedValue = Number(value).toFixed(2);
|
||||
extension_settings.tts.playback_rate = value;
|
||||
$('#playback_rate_counter').val(formattedValue);
|
||||
saveSettingsDebounced();
|
||||
});
|
||||
|
||||
$('#tts_voices').on('click', onTtsVoicesClick);
|
||||
for (const provider in ttsProviders) {
|
||||
$('#tts_provider').append($('<option />').val(provider).text(provider));
|
||||
@@ -1062,8 +1093,40 @@ $(document).ready(function () {
|
||||
setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
|
||||
eventSource.on(event_types.MESSAGE_SWIPED, resetTtsPlayback);
|
||||
eventSource.on(event_types.CHAT_CHANGED, onChatChanged);
|
||||
eventSource.on(event_types.MESSAGE_DELETED, onChatDeleted);
|
||||
eventSource.on(event_types.MESSAGE_DELETED, onMessageDeleted);
|
||||
eventSource.on(event_types.GROUP_UPDATED, onChatChanged);
|
||||
registerSlashCommand('speak', onNarrateText, ['narrate', 'tts'], '<span class="monospace">(text)</span> – narrate any text using currently selected character\'s voice. Use voice="Character Name" argument to set other voice from the voice map, example: <tt>/speak voice="Donald Duck" Quack!</tt>', true, true);
|
||||
eventSource.makeLast(event_types.CHARACTER_MESSAGE_RENDERED, onMessageEvent);
|
||||
eventSource.makeLast(event_types.USER_MESSAGE_RENDERED, onMessageEvent);
|
||||
SlashCommandParser.addCommandObject(SlashCommand.fromProps({ name: 'speak',
|
||||
callback: onNarrateText,
|
||||
aliases: ['narrate', 'tts'],
|
||||
namedArgumentList: [
|
||||
new SlashCommandNamedArgument(
|
||||
'voice', 'character voice name', [ARGUMENT_TYPE.STRING], false,
|
||||
),
|
||||
],
|
||||
unnamedArgumentList: [
|
||||
new SlashCommandArgument(
|
||||
'text', [ARGUMENT_TYPE.STRING], true,
|
||||
),
|
||||
],
|
||||
helpString: `
|
||||
<div>
|
||||
Narrate any text using currently selected character's voice.
|
||||
</div>
|
||||
<div>
|
||||
Use <code>voice="Character Name"</code> argument to set other voice from the voice map.
|
||||
</div>
|
||||
<div>
|
||||
<strong>Example:</strong>
|
||||
<ul>
|
||||
<li>
|
||||
<pre><code>/speak voice="Donald Duck" Quack!</code></pre>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
`,
|
||||
}));
|
||||
|
||||
document.body.appendChild(audioElement);
|
||||
});
|
||||
|
@@ -28,6 +28,8 @@ class NovelTtsProvider {
|
||||
processText(text) {
|
||||
// Novel reads tilde as a word. Replace with full stop
|
||||
text = text.replace(/~/g, '.');
|
||||
// Novel reads asterisk as a word. Remove it
|
||||
text = text.replace(/\*/g, '');
|
||||
return text;
|
||||
}
|
||||
|
||||
|
@@ -11,7 +11,7 @@ class SileroTtsProvider {
|
||||
settings;
|
||||
ready = false;
|
||||
voices = [];
|
||||
separator = ' .. ';
|
||||
separator = ' ';
|
||||
|
||||
defaultSettings = {
|
||||
provider_endpoint: 'http://localhost:8001/tts',
|
||||
|
Reference in New Issue
Block a user