Merge branch 'staging' of http://github.com/cohee1207/SillyTavern into staging
This commit is contained in:
commit
87b9da11c6
|
@ -73,6 +73,7 @@ const extension_settings = {
|
|||
fluctuation: 0.1,
|
||||
enabled: false,
|
||||
},
|
||||
speech_recognition: {},
|
||||
};
|
||||
|
||||
let modules = [];
|
||||
|
|
|
@ -372,8 +372,6 @@ function onEditPromptClick() {
|
|||
let popupText = ''
|
||||
popupText += `
|
||||
<div class="objective_prompt_modal">
|
||||
<div class="alignitemsflexstart flex-container">
|
||||
</div>
|
||||
<div>
|
||||
<label for="objective-prompt-generate">Generation Prompt</label>
|
||||
<textarea id="objective-prompt-generate" type="text" class="text_pole textarea_compact" rows="8"></textarea>
|
||||
|
@ -382,12 +380,14 @@ function onEditPromptClick() {
|
|||
<label for="objective-prompt-extension-prompt">Injected Prompt</label>
|
||||
<textarea id="objective-prompt-extension-prompt" type="text" class="text_pole textarea_compact" rows="8"></textarea>
|
||||
</div>
|
||||
<div class="alignitemsflexstart flex-container">
|
||||
<input id="objective-custom-prompt-name" type="text" class="flex1 heightFitContent text_pole widthNatural" maxlength="250" placeholder="Custom Prompt Name">
|
||||
<input id="objective-custom-prompt-save" class="menu_button" type="submit" value="Save Custom Prompt" />
|
||||
<label for="objective-prompt-load"> Load Prompt </label>
|
||||
<div class="objective_prompt_block">
|
||||
<input id="objective-custom-prompt-name" style="flex-grow:2" type="text" class="flex1 heightFitContent text_pole widthNatural" maxlength="250" placeholder="Custom Prompt Name">
|
||||
<input id="objective-custom-prompt-save" style="flex-grow:1" class="menu_button" type="submit" value="Save Prompt" />
|
||||
</div>
|
||||
<div class="objective_prompt_block">
|
||||
<label for="objective-prompt-load">Load Prompt</label>
|
||||
<select id="objective-prompt-load"><select>
|
||||
<input id="objective-custom-prompt-delete" class="menu_button" type="submit" value="Delete Custom Prompt" />
|
||||
<input id="objective-custom-prompt-delete" class="menu_button" type="submit" value="Delete Prompt" />
|
||||
</div>
|
||||
</div>`
|
||||
callPopup(popupText, 'text')
|
||||
|
|
|
@ -10,6 +10,13 @@
|
|||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.objective_prompt_block {
|
||||
display: flex;
|
||||
align-items: baseline;
|
||||
column-gap: 5px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.objective_block_control {
|
||||
align-items: baseline;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,233 @@
|
|||
// Borrowed from Agnai (AGPLv3)
|
||||
// https://github.com/agnaistic/agnai/blob/dev/web/pages/Chat/components/SpeechRecognitionRecorder.tsx
|
||||
// First version by Cohee#1207
|
||||
// Adapted by Tony-sama
|
||||
|
||||
export { BrowserSttProvider }
|
||||
|
||||
const DEBUG_PREFIX = "<Speech Recognition module (Browser)> "
|
||||
|
||||
/**
 * Speech-to-text provider backed by the browser's Web Speech API
 * (`SpeechRecognition` / `webkitSpeechRecognition`).
 *
 * `loadSettings` binds `#microphone_button` as a start/stop toggle, writes
 * interim results into `#send_textarea`, and passes the finished transcript to
 * `processTranscriptFunction` once recognition ends.
 */
class BrowserSttProvider {
    //########//
    // Config //
    //########//

    /** Languages offered in the settings drop-down, as [language tag, label] pairs. */
    static supportedLanguages = [
        ["ar-SA", "Arabic (Saudi Arabia)"],
        ["bn-BD", "Bangla (Bangladesh)"],
        ["bn-IN", "Bangla (India)"],
        ["cs-CZ", "Czech (Czech Republic)"],
        ["da-DK", "Danish (Denmark)"],
        ["de-AT", "German (Austria)"],
        ["de-CH", "German (Switzerland)"],
        ["de-DE", "German (Germany)"],
        ["el-GR", "Greek (Greece)"],
        ["en-AU", "English (Australia)"],
        ["en-CA", "English (Canada)"],
        ["en-GB", "English (United Kingdom)"],
        ["en-IE", "English (Ireland)"],
        ["en-IN", "English (India)"],
        ["en-NZ", "English (New Zealand)"],
        ["en-US", "English (United States)"],
        ["en-ZA", "English (South Africa)"],
        ["es-AR", "Spanish (Argentina)"],
        ["es-CL", "Spanish (Chile)"],
        ["es-CO", "Spanish (Colombia)"],
        ["es-ES", "Spanish (Spain)"],
        ["es-MX", "Spanish (Mexico)"],
        ["es-US", "Spanish (United States)"],
        ["fi-FI", "Finnish (Finland)"],
        ["fr-BE", "French (Belgium)"],
        ["fr-CA", "French (Canada)"],
        ["fr-CH", "French (Switzerland)"],
        ["fr-FR", "French (France)"],
        ["he-IL", "Hebrew (Israel)"],
        ["hi-IN", "Hindi (India)"],
        ["hu-HU", "Hungarian (Hungary)"],
        ["id-ID", "Indonesian (Indonesia)"],
        ["it-CH", "Italian (Switzerland)"],
        ["it-IT", "Italian (Italy)"],
        ["ja-JP", "Japanese (Japan)"],
        ["ko-KR", "Korean (Republic of Korea)"],
        ["nl-BE", "Dutch (Belgium)"],
        ["nl-NL", "Dutch (The Netherlands)"],
        ["no-NO", "Norwegian (Norway)"],
        ["pl-PL", "Polish (Poland)"],
        ["pt-BR", "Portuguese (Brazil)"],
        ["pt-PT", "Portuguese (Portugal)"],
        ["ro-RO", "Romanian (Romania)"],
        ["ru-RU", "Russian (Russian Federation)"],
        ["sk-SK", "Slovak (Slovakia)"],
        ["sv-SE", "Swedish (Sweden)"],
        ["ta-IN", "Tamil (India)"],
        ["ta-LK", "Tamil (Sri Lanka)"],
        ["th-TH", "Thai (Thailand)"],
        ["tr-TR", "Turkish (Turkey)"],
        ["zh-CN", "Chinese (China)"],
        ["zh-HK", "Chinese (Hong Kong)"],
        ["zh-TW", "Chinese (Taiwan)"],
    ];

    /** Active settings; replaced by a fresh object on every `loadSettings` call. */
    settings = {
        language: ""
    }

    /** Fallback values; their keys are also the only keys `loadSettings` accepts. */
    defaultSettings = {
        language: "en-US",
    }

    /** Callback receiving the finished transcript; captured by `loadSettings`, so set it first. */
    processTranscriptFunction = null;

    /**
     * HTML for the provider-specific settings: a language drop-down.
     * @returns {string} Markup containing `#speech_recognition_browser_provider_language`.
     */
    get settingsHtml() {
        const options = BrowserSttProvider.supportedLanguages
            .map(([tag, label]) => `<option value="${tag}">${tag}: ${label}</option>`)
            .join(' ');
        return ` <span>Language</span> </br> <select id="speech_recognition_browser_provider_language"> ${options} </select> `;
    }

    /** Reads the language drop-down and re-initialises recognition with it. */
    onSettingsChange() {
        // Used when provider settings are updated from UI
        this.settings.language = $("#speech_recognition_browser_provider_language").val();
        console.debug(DEBUG_PREFIX+"Change language to",this.settings.language);
        this.loadSettings(this.settings);
    }

    /**
     * Upper-cases the first letter of a transcript fragment.
     *
     * Fragments longer than two characters that start with a space are
     * capitalised at index 1 and lose that leading space; other fragments longer
     * than one character are capitalised at index 0. Shorter input is returned
     * unchanged.
     * @param {string} interimTranscript Fragment as delivered by the recogniser.
     * @returns {string} The capitalised fragment.
     */
    static capitalizeInterim(interimTranscript) {
        let capitalizeIndex = -1;
        if (interimTranscript.length > 2 && interimTranscript[0] === ' ') capitalizeIndex = 1;
        else if (interimTranscript.length > 1) capitalizeIndex = 0;

        if (capitalizeIndex === -1) return interimTranscript;

        const capitalized = interimTranscript[capitalizeIndex].toLocaleUpperCase();
        const rest = interimTranscript.substring(capitalizeIndex + 1);
        return capitalized + rest;
    }

    /**
     * Joins two transcript pieces, inserting a space only after a full stop.
     * @param {string} previous Text accumulated so far.
     * @param {string} interim Text to append.
     * @returns {string} The combined text.
     */
    static composeValues(previous, interim) {
        const spacing = previous.endsWith('.') ? ' ' : '';
        return previous + spacing + interim;
    }

    /**
     * Applies the settings and (re)creates the recogniser and its UI bindings.
     * @param {Object} settings Saved provider settings; may be empty.
     * @throws {Error} If `settings` contains a key that is not in `defaultSettings`.
     */
    loadSettings(settings) {
        const processTranscript = this.processTranscriptFunction;

        // Populate Provider UI given input settings
        if (Object.keys(settings).length == 0) {
            console.debug(DEBUG_PREFIX+"Using default browser STT settings")
        }

        // Start from a copy so that user values never overwrite defaultSettings
        this.settings = { ...this.defaultSettings };

        for (const key in settings) {
            if (key in this.settings) {
                this.settings[key] = settings[key];
            } else {
                throw new Error(`Invalid setting passed to Speech recognition extension (browser): ${key}`);
            }
        }

        $("#speech_recognition_browser_provider_language").val(this.settings.language);

        const speechRecognitionSettings = {
            grammar: '' // Custom grammar
        };

        const SpeechRecognitionCtor = window.SpeechRecognition || window.webkitSpeechRecognition;
        const SpeechGrammarListCtor = window.SpeechGrammarList || window.webkitSpeechGrammarList;

        if (!SpeechRecognitionCtor) {
            console.warn(DEBUG_PREFIX+'Speech recognition is not supported in this browser.');
            $("#microphone_button").hide();
            toastr.error("Speech recognition is not supported in this browser, use another browser or another provider of SillyTavern-extras Speech recognition extension.", "Speech recognition activation Failed (Browser)", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
            return;
        }

        const recognition = new SpeechRecognitionCtor();

        if (speechRecognitionSettings.grammar && SpeechGrammarListCtor) {
            // addFromString is an instance method, so a list has to be created first
            const grammarList = new SpeechGrammarListCtor();
            grammarList.addFromString(speechRecognitionSettings.grammar, 1);
            recognition.grammars = grammarList;
        }

        recognition.continuous = true;
        recognition.interimResults = true;
        recognition.lang = this.settings.language;

        const textarea = $('#send_textarea');
        const button = $('#microphone_button');

        let listening = false;
        // Drop the handler of any previously loaded provider before binding ours
        button.off('click').on("click", function () {
            if (listening) {
                recognition.stop();
            } else {
                recognition.start();
            }
            listening = !listening;
        });

        // Textarea content present when recognition started; results are appended to it
        let initialText = '';

        recognition.onresult = function (speechEvent) {
            let finalTranscript = '';
            let interimTranscript = '';

            for (let i = speechEvent.resultIndex; i < speechEvent.results.length; ++i) {
                const transcript = speechEvent.results[i][0].transcript;

                if (speechEvent.results[i].isFinal) {
                    const interim = BrowserSttProvider.capitalizeInterim(transcript);
                    if (interim != '') {
                        let final = BrowserSttProvider.composeValues(finalTranscript, interim);
                        const lastChar = final.slice(-1);
                        if (lastChar !== '.' && lastChar !== '?') final += '.';
                        finalTranscript = final;
                        // A final sentence ends the session; onend then forwards the text
                        recognition.abort();
                        listening = false;
                    }
                    interimTranscript = ' ';
                } else {
                    interimTranscript += transcript;
                }
            }

            interimTranscript = BrowserSttProvider.capitalizeInterim(interimTranscript);

            textarea.val(initialText + finalTranscript + interimTranscript);
        };

        recognition.onerror = function (event) {
            console.error('Error occurred in recognition:', event.error);
            //if ($('#speech_recognition_debug').is(':checked'))
            //    toastr.error('Error occurred in recognition:'+ event.error, 'STT Generation error (Browser)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
        };

        recognition.onend = function () {
            listening = false;
            button.toggleClass('fa-microphone fa-microphone-slash');

            // Without a callback the recognised text is simply left in the textarea
            if (typeof processTranscript !== 'function') {
                console.warn(DEBUG_PREFIX+'No transcript callback set, leaving text in message area');
                return;
            }

            const newText = textarea.val().substring(initialText.length);
            textarea.val(textarea.val().substring(0,initialText.length));
            processTranscript(newText);
        };

        recognition.onstart = function () {
            initialText = textarea.val();
            button.toggleClass('fa-microphone fa-microphone-slash');

            if ($("#speech_recognition_message_mode").val() == "replace") {
                textarea.val("");
                initialText = ""
            }
        };

        $("#microphone_button").show();

        console.debug(DEBUG_PREFIX+"Browser STT settings loaded")
    }
}
|
|
@ -1,110 +1,351 @@
|
|||
// Borrowed from Agnai (AGPLv3)
|
||||
// https://github.com/agnaistic/agnai/blob/dev/web/pages/Chat/components/SpeechRecognitionRecorder.tsx
|
||||
function capitalizeInterim(interimTranscript) {
|
||||
let capitalizeIndex = -1;
|
||||
if (interimTranscript.length > 2 && interimTranscript[0] === ' ') capitalizeIndex = 1;
|
||||
else if (interimTranscript.length > 1) capitalizeIndex = 0;
|
||||
if (capitalizeIndex > -1) {
|
||||
const spacing = capitalizeIndex > 0 ? ' '.repeat(capitalizeIndex - 1) : '';
|
||||
const capitalized = interimTranscript[capitalizeIndex].toLocaleUpperCase();
|
||||
const rest = interimTranscript.substring(capitalizeIndex + 1);
|
||||
interimTranscript = spacing + capitalized + rest;
|
||||
/*
|
||||
TODO:
|
||||
- try pseudo streaming audio by just sending chunk every X seconds and asking VOSK if it is full text.
|
||||
*/
|
||||
|
||||
import { saveSettingsDebounced } from "../../../script.js";
|
||||
import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper, doExtrasFetch } from "../../extensions.js";
|
||||
import { VoskSttProvider } from './vosk.js'
|
||||
import { WhisperSttProvider } from './whisper.js'
|
||||
import { BrowserSttProvider } from './browser.js'
|
||||
export { MODULE_NAME };
|
||||
|
||||
// Name of this extension (exported by the statement above).
const MODULE_NAME = 'Speech Recognition';
// Prefix prepended to the console messages of this module.
const DEBUG_PREFIX = "<Speech Recognition module> ";

// Selectable providers: drop-down name -> provider class ("None" has no class).
let sttProviders = {
    None: null,
    Browser: BrowserSttProvider,
    Whisper: WhisperSttProvider,
    Vosk: VoskSttProvider,
};

// Provider instance currently in use and the name it was selected under.
let sttProvider = null;
let sttProviderName = "None";

// State shared by the MediaRecorder handlers.
let audioRecording = false;
const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } };
let audioChunks = [];
|
||||
|
||||
/**
 * Routes a recognised transcript according to the extension settings.
 *
 * When message mapping is enabled and the lower-cased, punctuation-free
 * transcript contains a mapped phrase, the mapped message is written to
 * `#send_textarea` (and generation is triggered in "auto_send" mode).
 * Otherwise the transcript is sent, replaces, or is appended to the textarea
 * depending on `messageMode`. Errors are logged, never rethrown.
 *
 * @param {string} transcript Raw transcript from the provider.
 * @returns {Promise<void>}
 */
async function processTranscript(transcript) {
    try {
        const transcriptFormatted = transcript.trim();

        if (transcriptFormatted.length === 0) {
            console.debug(DEBUG_PREFIX+"Empty transcript, do nothing");
            return;
        }

        console.debug(DEBUG_PREFIX+"recorded transcript: \""+transcriptFormatted+"\"");
        const messageMode = extension_settings.speech_recognition.messageMode;
        console.debug(DEBUG_PREFIX+"mode: "+messageMode);

        // remove punctuation
        const transcriptRaw = transcriptFormatted.toLowerCase().replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");

        // Check message mapping
        if (extension_settings.speech_recognition.messageMappingEnabled) {
            const mapping = extension_settings.speech_recognition.messageMapping;
            console.debug(DEBUG_PREFIX+"Start searching message mapping into:",transcriptRaw)
            for (const key in mapping) {
                console.debug(DEBUG_PREFIX+"message mapping searching: ", key,"=>",mapping[key]);
                if (transcriptRaw.includes(key)) {
                    const mappedMessage = mapping[key];
                    console.debug(DEBUG_PREFIX+"message mapping found: ", key,"=>",mappedMessage);
                    $("#send_textarea").val(mappedMessage);

                    if (messageMode == "auto_send") await getContext().generate();
                    return;
                }
            }
        }

        console.debug(DEBUG_PREFIX+"no message mapping found, processing transcript as normal message");

        switch (messageMode) {
            case "auto_send": {
                $('#send_textarea').val("") // clear message area to avoid double message

                console.debug(DEBUG_PREFIX+"Sending message")
                const context = getContext();
                const message = {
                    name: context.name1,
                    is_user: true,
                    is_name: true,
                    send_date: Date.now(),
                    mes: transcriptFormatted,
                };
                context.chat.push(message);
                context.addOneMessage(message);

                await context.generate();

                $('#debug_output').text("<SST-module DEBUG>: message sent: \""+ transcriptFormatted +"\"");
                break;
            }

            case "replace":
                console.debug(DEBUG_PREFIX+"Replacing message")
                $('#send_textarea').val(transcriptFormatted);
                break;

            case "append":
                console.debug(DEBUG_PREFIX+"Appending message")
                $('#send_textarea').val($('#send_textarea').val()+" "+transcriptFormatted);
                break;

            default:
                console.debug(DEBUG_PREFIX+"Not supported stt message mode: "+messageMode)
        }
    }
    catch (error) {
        // Best effort: a failed transcript must not break the recording UI
        console.error(DEBUG_PREFIX+"Failed to process transcript:", error);
    }
}
|
||||
|
||||
function composeValues(previous, interim) {
|
||||
let spacing = '';
|
||||
if (previous.endsWith('.')) spacing = ' ';
|
||||
return previous + spacing + interim;
|
||||
/**
 * Turns `#microphone_button` into a record/stop toggle based on MediaRecorder.
 *
 * Requests microphone access with `constraints`. After each stop, the recorded
 * chunks are sent to `sttProvider.processAudio` and the resulting transcript is
 * passed to `processTranscript`. Shows an error toast when `getUserMedia` is
 * unavailable.
 */
function loadNavigatorAudioRecording() {
    // mediaDevices itself can be missing, so check it before reading getUserMedia
    if (!navigator.mediaDevices?.getUserMedia) {
        console.debug(DEBUG_PREFIX+"getUserMedia not supported on your browser!");
        toastr.error("getUserMedia not supported", DEBUG_PREFIX+"not supported for your browser.", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
        return;
    }

    console.debug(DEBUG_PREFIX+' getUserMedia supported by browser.');

    const onSuccess = function(stream) {
        const mediaRecorder = new MediaRecorder(stream);

        $("#microphone_button").off('click').on("click", function() {
            if (!audioRecording) {
                mediaRecorder.start();
                console.debug(mediaRecorder.state);
                console.debug("recorder started");
                audioRecording = true;
            }
            else {
                mediaRecorder.stop();
                console.debug(mediaRecorder.state);
                console.debug("recorder stopped");
                audioRecording = false;
            }
            $("#microphone_button").toggleClass('fa-microphone fa-microphone-slash');
        });

        mediaRecorder.onstop = async function() {
            console.debug(DEBUG_PREFIX+"data available after MediaRecorder.stop() called: ", audioChunks.length, " chunks");
            const audioBlob = new Blob(audioChunks, { type: "audio/wav; codecs=0" });
            audioChunks = [];

            try {
                const transcript = await sttProvider.processAudio(audioBlob);

                // TODO: lock and release recording while processing?
                console.debug(DEBUG_PREFIX+"received transcript:", transcript);
                processTranscript(transcript);
            } catch (error) {
                // A failed request must not surface as an unhandled rejection
                console.error(DEBUG_PREFIX+"Audio processing failed:", error);
            }
        }

        mediaRecorder.ondataavailable = function(e) {
            audioChunks.push(e.data);
        }
    }

    const onError = function(err) {
        console.debug(DEBUG_PREFIX+"The following error occured: " + err);
    }

    navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
}
|
||||
|
||||
(function ($) {
|
||||
$.fn.speechRecognitionPlugin = function (options) {
|
||||
const settings = $.extend({
|
||||
grammar: '' // Custom grammar
|
||||
}, options);
|
||||
//##############//
|
||||
// STT Provider //
|
||||
//##############//
|
||||
|
||||
const speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
|
||||
const speechRecognitionList = window.SpeechGrammarList || window.webkitSpeechGrammarList;
|
||||
/**
 * Activates the named speech-to-text provider and rebuilds its settings UI.
 *
 * Stores the choice in the extension settings, hides the STT controls for
 * "None", and otherwise instantiates the provider class, renders its settings
 * HTML and loads its saved settings. Names missing from `sttProviders`
 * fall back to "None".
 *
 * @param {string} provider Key of `sttProviders`.
 */
function loadSttProvider(provider) {
    //Clear the current config and add new config
    $("#speech_recognition_provider_settings").html("");

    // An unknown name has no class to instantiate, so treat it as disabled
    let providerName = provider;
    if (!Object.keys(sttProviders).includes(providerName)) {
        console.warn(DEBUG_PREFIX+"Unknown provider, falling back to None:", providerName);
        providerName = "None";
    }

    // Init provider references
    extension_settings.speech_recognition.currentProvider = providerName;
    sttProviderName = providerName;

    if (!(sttProviderName in extension_settings.speech_recognition)) {
        console.warn(`Provider ${sttProviderName} not in Extension Settings, initiatilizing provider in settings`);
        extension_settings.speech_recognition[sttProviderName] = {};
    }

    $('#speech_recognition_provider').val(sttProviderName);

    if (sttProviderName == "None") {
        $("#microphone_button").hide();
        $("#speech_recognition_message_mode_div").hide();
        $("#speech_recognition_message_mapping_div").hide();
        return;
    }

    $("#speech_recognition_message_mode_div").show();
    $("#speech_recognition_message_mapping_div").show();

    sttProvider = new sttProviders[sttProviderName]();

    // Init provider settings
    $('#speech_recognition_provider_settings').append(sttProvider.settingsHtml);

    // Use microphone button as push to talk
    if (sttProviderName == "Browser") {
        // The browser provider binds the button itself and needs the callback first
        sttProvider.processTranscriptFunction = processTranscript;
        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
    }
    else {
        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
        loadNavigatorAudioRecording();

        $("#microphone_button").show();
    }
}
|
||||
|
||||
/** Change handler of the provider drop-down: loads the chosen provider and saves. */
function onSttProviderChange() {
    loadSttProvider($('#speech_recognition_provider').val());
    saveSettingsDebounced();
}
|
||||
|
||||
/** Input handler of the provider settings form: applies the change and stores it. */
function onSttProviderSettingsInput() {
    sttProvider.onSettingsChange();

    // Persist changes to SillyTavern stt extension settings
    const { settings } = sttProvider;
    extension_settings.speech_recognition[sttProviderName] = settings;
    saveSettingsDebounced();
    console.info(`Saved settings ${sttProviderName} ${JSON.stringify(settings)}`);
}
|
||||
|
||||
//#############################//
|
||||
// Extension UI and Settings //
|
||||
//#############################//
|
||||
|
||||
// Initial extension settings, applied when nothing has been saved yet.
const defaultSettings = {
    currentProvider: "None",
    messageMode: "append",
    messageMappingText: "",
    // Spoken phrase -> message lookup; a plain object, as built by the mapping parser
    messageMapping: {},
    messageMappingEnabled: false
}
|
||||
|
||||
/**
 * Fills missing extension settings from `defaultSettings` and copies the
 * stored values into the settings UI.
 */
function loadSettings() {
    const settings = extension_settings.speech_recognition;

    // Fill per key so a partially saved object cannot leave fields undefined
    for (const [key, value] of Object.entries(defaultSettings)) {
        if (settings[key] === undefined) {
            settings[key] = value;
        }
    }

    $('#speech_recognition_enabled').prop('checked',settings.enabled);
    $('#speech_recognition_message_mode').val(settings.messageMode);

    if (settings.messageMappingText.length > 0) {
        $('#speech_recognition_message_mapping').val(settings.messageMappingText);
    }

    $('#speech_recognition_message_mapping_enabled').prop('checked',settings.messageMappingEnabled);
}
|
||||
|
||||
/**
 * Change handler of the message-mode drop-down: stores the mode and shows the
 * wait-for-response option only for non-browser providers in "auto_send" mode.
 */
async function onMessageModeChange() {
    extension_settings.speech_recognition.messageMode = $('#speech_recognition_message_mode').val();

    const isAutoSend = extension_settings.speech_recognition.messageMode === "auto_send";
    if (sttProviderName !== "Browser" && isAutoSend) {
        $("#speech_recognition_wait_response_div").show()
    }
    else {
        $("#speech_recognition_wait_response_div").hide()
    }

    saveSettingsDebounced();
}
|
||||
|
||||
/**
 * Change handler of the message-mapping textarea.
 *
 * Parses comma separated `phrase = message` entries into
 * `extension_settings.speech_recognition.messageMapping`. Both sides are
 * lower-cased and trimmed; only the first `=` separates them, so the message
 * itself may contain `=`. Entries without `=` are skipped with a debug message.
 */
async function onMessageMappingChange() {
    const mappingText = $('#speech_recognition_message_mapping').val();
    const entries = mappingText
        .split(",")
        .map((element) => element.trim())
        .filter((str) => str !== '');

    const mapping = {};
    extension_settings.speech_recognition.messageMapping = mapping;

    for (const text of entries) {
        const lowered = text.toLowerCase();
        const separatorIndex = lowered.indexOf("=");

        if (separatorIndex === -1) {
            console.debug(DEBUG_PREFIX+"Wrong syntax for message mapping, no '=' found in:", text);
            continue;
        }

        const phrase = lowered.substring(0, separatorIndex).trim();
        const message = lowered.substring(separatorIndex + 1).trim();
        mapping[phrase] = message;
        console.debug(DEBUG_PREFIX+"Added mapping", phrase,"=>", message);
    }

    $("#speech_recognition_message_mapping_status").text("Message mapping updated to: "+JSON.stringify(mapping))
    console.debug(DEBUG_PREFIX+"Updated message mapping", mapping);
    extension_settings.speech_recognition.messageMappingText = mappingText;
    saveSettingsDebounced();
}
|
||||
|
||||
recognition.continuous = true;
|
||||
recognition.interimResults = true;
|
||||
// TODO: This should be configurable.
|
||||
recognition.lang = 'en-US'; // Set the language to English (US).
|
||||
/** Click handler of the "Enable messages mapping" checkbox: stores its state and saves. */
async function onMessageMappingEnabledClick() {
    const isChecked = $('#speech_recognition_message_mapping_enabled').is(':checked');
    extension_settings.speech_recognition.messageMappingEnabled = isChecked;
    saveSettingsDebounced()
}
|
||||
|
||||
const $textarea = this;
|
||||
const $button = $('<div class="fa-solid fa-microphone speech-toggle" title="Click to speak"></div>');
|
||||
// Extension entry point: builds the settings UI, then loads settings and provider.
$(document).ready(function () {
    // Appends the settings drawer and the microphone button, and wires their events.
    function addExtensionControls() {
        const settingsHtml = `
        <div id="speech_recognition_settings">
            <div class="inline-drawer">
                <div class="inline-drawer-toggle inline-drawer-header">
                    <b>Speech Recognition</b>
                    <div class="inline-drawer-icon fa-solid fa-circle-chevron-down down"></div>
                </div>
                <div class="inline-drawer-content">
                    <div>
                        <span>Select Speech-to-text Provider</span> </br>
                        <select id="speech_recognition_provider">
                        </select>
                    </div>
                    <div id="speech_recognition_message_mode_div">
                        <span>Message Mode</span> </br>
                        <select id="speech_recognition_message_mode">
                            <option value="append">Append</option>
                            <option value="replace">Replace</option>
                            <option value="auto_send">Auto send</option>
                        </select>
                    </div>
                    <div id="speech_recognition_message_mapping_div">
                        <span>Message Mapping</span>
                        <textarea id="speech_recognition_message_mapping" class="text_pole textarea_compact" type="text" rows="4" placeholder="Enter comma separated phrases mapping, example:\ncommand delete = /del 2,\nslash delete = /del 2,\nsystem roll = /roll 2d6,\nhey continue = /continue"></textarea>
                        <span id="speech_recognition_message_mapping_status"></span>
                        <label class="checkbox_label" for="speech_recognition_message_mapping_enabled">
                            <input type="checkbox" id="speech_recognition_message_mapping_enabled" name="speech_recognition_message_mapping_enabled">
                            <small>Enable messages mapping</small>
                        </label>
                    </div>
                    <form id="speech_recognition_provider_settings" class="inline-drawer-content">
                    </form>
                </div>
            </div>
        </div>
        `;
        $('#extensions_settings').append(settingsHtml);
        $('#speech_recognition_provider_settings').on('input', onSttProviderSettingsInput);
        for (const provider in sttProviders) {
            $('#speech_recognition_provider').append($("<option />").val(provider).text(provider));
            console.debug(DEBUG_PREFIX+"added option "+provider);
        }
        $('#speech_recognition_provider').on('change', onSttProviderChange);
        $('#speech_recognition_message_mode').on('change', onMessageModeChange);
        $('#speech_recognition_message_mapping').on('change', onMessageMappingChange);
        $('#speech_recognition_message_mapping_enabled').on('click', onMessageMappingEnabledClick);

        // Click handling is attached later by whichever provider gets loaded
        const $button = $('<div id="microphone_button" class="fa-solid fa-microphone speech-toggle" title="Click to speak"></div>');
        $('#send_but_sheld').prepend($button);
    }
    addExtensionControls(); // No init dependencies
    loadSettings(); // Depends on Extension Controls and loadTtsProvider
    loadSttProvider(extension_settings.speech_recognition.currentProvider); // No dependencies

    //const wrapper = new ModuleWorkerWrapper(moduleWorker);
    //setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
    //moduleWorker();
})
|
||||
|
|
|
@ -2,10 +2,13 @@
|
|||
"display_name": "Speech Recognition",
|
||||
"loading_order": 13,
|
||||
"requires": [],
|
||||
"optional": [],
|
||||
"optional": [
|
||||
"vosk-speech-recognition",
|
||||
"whisper-speech-recognition"
|
||||
],
|
||||
"js": "index.js",
|
||||
"css": "style.css",
|
||||
"author": "Cohee#1207",
|
||||
"version": "1.0.0",
|
||||
"author": "Cohee#1207 and Keij#6799",
|
||||
"version": "1.1.0",
|
||||
"homePage": "https://github.com/SillyTavern/SillyTavern"
|
||||
}
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
import { getApiUrl, doExtrasFetch } from "../../extensions.js";
|
||||
export { VoskSttProvider }
|
||||
|
||||
const DEBUG_PREFIX = "<Speech Recognition module (Vosk)> "
|
||||
|
||||
/**
 * Speech-to-text provider that posts recorded audio to the extras API
 * endpoint `/api/speech-recognition/vosk/process-audio`.
 */
class VoskSttProvider {
    //########//
    // Config //
    //########//

    /** Active settings; assigned by `loadSettings`. */
    settings;

    /** Fallback values; their keys are also the only keys `loadSettings` accepts. */
    defaultSettings = {
    }

    /**
     * HTML for the provider-specific settings.
     * @returns {string} Empty string; this provider has no settings UI.
     */
    get settingsHtml() {
        return "";
    }

    /** Called when the provider settings form changes; nothing to do here. */
    onSettingsChange() {
        // Used when provider settings are updated from UI
    }

    /**
     * Applies the saved settings on top of the defaults.
     * @param {Object} settings Saved provider settings; may be empty.
     * @throws {Error} If `settings` contains a key that is not in `defaultSettings`.
     */
    loadSettings(settings) {
        // Populate Provider UI given input settings
        if (Object.keys(settings).length == 0) {
            console.debug(DEBUG_PREFIX+"Using default vosk STT extension settings")
        }

        // Only accept keys defined in defaultSettings; copy so the defaults stay untouched
        this.settings = { ...this.defaultSettings };

        for (const key in settings) {
            if (key in this.settings) {
                this.settings[key] = settings[key];
            } else {
                throw new Error(`Invalid setting passed to STT extension: ${key}`);
            }
        }

        console.debug(DEBUG_PREFIX+"Vosk STT settings loaded")
    }

    /**
     * Sends a recording to the extras API and returns the recognised text.
     * @param {Blob} audioblob Recorded audio, uploaded as `record.wav`.
     * @returns {Promise<string>} The `transcript` field of the JSON response.
     * @throws {Error} If the API responds with a non-OK status.
     */
    async processAudio(audioblob) {
        const requestData = new FormData();
        requestData.append('AudioFile', audioblob, 'record.wav');

        const url = new URL(getApiUrl());
        url.pathname = '/api/speech-recognition/vosk/process-audio';

        const apiResult = await doExtrasFetch(url, {
            method: 'POST',
            body: requestData,
        });

        if (!apiResult.ok) {
            toastr.error(apiResult.statusText, 'STT Generation Failed (Vosk)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
            throw new Error(`HTTP ${apiResult.status}: ${await apiResult.text()}`);
        }

        const result = await apiResult.json();
        return result.transcript;
    }
}
|
|
@ -0,0 +1,67 @@
|
|||
import { getApiUrl, doExtrasFetch } from "../../extensions.js";
|
||||
export { WhisperSttProvider }
|
||||
|
||||
// Tag prepended to this provider's console.debug messages.
const DEBUG_PREFIX = "<Speech Recognition module (Whisper)> "
|
||||
|
||||
/**
 * Speech-to-text provider that sends recorded audio to the Whisper endpoint
 * of the Extras API and returns the transcript from the response.
 */
class WhisperSttProvider {
    //########//
    // Config //
    //########//

    /** Active settings; assigned by loadSettings(). */
    settings;

    /** Accepted setting keys with their default values (none defined yet). */
    defaultSettings = {
        //model_path: "",
    };

    /**
     * Markup for the provider-specific settings UI.
     * @returns {string} HTML string; currently empty.
     */
    get settingsHtml() {
        return "";
    }

    /** Used when provider settings are updated from UI; currently a no-op. */
    onSettingsChange() {
    }

    /**
     * Replaces the active settings with the defaults overlaid by `settings`.
     *
     * @param {object} [settings] Saved settings; `undefined`/`null` is treated as empty.
     * @throws {Error} If `settings` contains a key that is not an own key of `defaultSettings`.
     */
    loadSettings(settings) {
        const incoming = settings ?? {};

        if (Object.keys(incoming).length === 0) {
            console.debug(DEBUG_PREFIX + "Using default Whisper STT extension settings");
        }

        // Work on a copy so that applying saved values never mutates defaultSettings.
        this.settings = { ...this.defaultSettings };

        // Only accept keys defined in defaultSettings (own keys, not inherited
        // ones such as "toString").
        for (const [key, value] of Object.entries(incoming)) {
            if (!Object.hasOwn(this.defaultSettings, key)) {
                throw new Error(`Invalid setting passed to STT extension: ${key}`);
            }
            this.settings[key] = value;
        }

        console.debug(DEBUG_PREFIX + "Whisper STT settings loaded");
    }

    /**
     * Posts the audio as multipart form data (field 'AudioFile', file name
     * 'record.wav') to the Whisper process-audio route of the Extras API.
     *
     * @param {Blob} audioblob Recorded audio to transcribe.
     * @returns {Promise<*>} The `transcript` field of the JSON response.
     * @throws {Error} If the response is not ok; an error toast is shown first.
     */
    async processAudio(audioblob) {
        const requestData = new FormData();
        requestData.append('AudioFile', audioblob, 'record.wav');

        const url = new URL(getApiUrl());
        url.pathname = '/api/speech-recognition/whisper/process-audio';

        const apiResult = await doExtrasFetch(url, {
            method: 'POST',
            body: requestData,
        });

        if (!apiResult.ok) {
            toastr.error(apiResult.statusText, 'STT Generation Failed (Whisper)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
            throw new Error(`HTTP ${apiResult.status}: ${await apiResult.text()}`);
        }

        const result = await apiResult.json();
        return result.transcript;
    }
}
|
|
@ -1,7 +1,15 @@
|
|||
import { eventSource, event_types } from "../../../script.js"
|
||||
import { doExtrasFetch, getApiUrl, modules } from "../../extensions.js"
|
||||
|
||||
export { CoquiTtsProvider }
|
||||
|
||||
/**
 * Guard for calls that need the Coqui TTS module: when 'coqui-tts' is absent
 * from `modules`, shows an error toast and throws.
 *
 * @throws {Error} If `modules` does not include 'coqui-tts'.
 */
function throwIfModuleMissing() {
    const isCoquiLoaded = modules.includes('coqui-tts');
    if (isCoquiLoaded) {
        return;
    }

    toastr.error(`Coqui TTS module not loaded. Add coqui-tts to enable-modules and restart the Extras API.`);
    throw new Error(`Coqui TTS module not loaded.`);
}
|
||||
|
||||
class CoquiTtsProvider {
|
||||
//########//
|
||||
// Config //
|
||||
|
@ -12,51 +20,45 @@ class CoquiTtsProvider {
|
|||
separator = ' .. '
|
||||
|
||||
defaultSettings = {
|
||||
provider_endpoint: "http://localhost:5100",
|
||||
voiceMap: {}
|
||||
}
|
||||
|
||||
|
||||
get settingsHtml() {
|
||||
let html = `
|
||||
<div style="display: flex; width: 100%;">
|
||||
<div style="flex: 80%;">
|
||||
<label for="model">Model:</label>
|
||||
<select id="model">
|
||||
<option value="none">Select Model</option>
|
||||
<!-- Add more model options here -->
|
||||
</select>
|
||||
<div class="flex wide100p flexGap10 alignitemscenter">
|
||||
<div style="flex: 80%;">
|
||||
<label for="coqui_model">Model:</label>
|
||||
<select id="coqui_model">
|
||||
<option value="none">Select Model</option>
|
||||
<!-- Add more model options here -->
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex justifyCenter" style="flex: 20%;">
|
||||
<button id="coqui_preview" class="menu_button menu_button_icon wide100p" type="button">
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div style="flex: 20%; display: flex; justify-content: center;">
|
||||
<button id="preview" class="menu_button" type="button" style="width: 100%;">Play</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div style="display: flex; width: 100%;">
|
||||
<div style="flex: 1; margin-right: 10px;">
|
||||
<label for="speaker">Speaker:</label>
|
||||
<select id="speaker">
|
||||
<!-- Add more speaker options here -->
|
||||
</select>
|
||||
</div>
|
||||
<div style="flex: 1;">
|
||||
<label for="language">Language:</label>
|
||||
<select id="language">
|
||||
<!-- Add more language options here -->
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<label for="Coqui_tts_endpoint">Provider Endpoint:</label>
|
||||
<input id="Coqui_tts_endpoint" type="text" class="text_pole" maxlength="250" value="${this.defaultSettings.provider_endpoint}"/>
|
||||
<div class="flex wide100p flexGap10">
|
||||
<div class="flex1">
|
||||
<label for="coqui_speaker">Speaker:</label>
|
||||
<select id="coqui_speaker">
|
||||
<!-- Add more speaker options here -->
|
||||
</select>
|
||||
</div>
|
||||
<div class="flex1">
|
||||
<label for="coqui_language">Language:</label>
|
||||
<select id="coqui_language">
|
||||
<!-- Add more language options here -->
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
`
|
||||
return html
|
||||
}
|
||||
|
||||
onSettingsChange() {
|
||||
// Used when provider settings are updated from UI
|
||||
this.settings.provider_endpoint = $('#Coqui_tts_endpoint').val()
|
||||
}
|
||||
|
||||
loadSettings(settings) {
|
||||
|
@ -64,70 +66,61 @@ class CoquiTtsProvider {
|
|||
if (Object.keys(settings).length == 0) {
|
||||
console.info("Using default TTS Provider settings")
|
||||
}
|
||||
|
||||
const modelSelect = document.getElementById('model');
|
||||
const previewButton = document.getElementById('preview');
|
||||
|
||||
const modelSelect = document.getElementById('coqui_model');
|
||||
const previewButton = document.getElementById('coqui_preview');
|
||||
previewButton.addEventListener('click', () => {
|
||||
const selectedModel = modelSelect.value;
|
||||
this.sampleTtsVoice(selectedModel);
|
||||
});//add event listener to button
|
||||
|
||||
|
||||
|
||||
previewButton.disabled = true;
|
||||
previewButton.innerText = "Select Model";
|
||||
|
||||
|
||||
// Only accept keys defined in defaultSettings
|
||||
this.settings = this.defaultSettings
|
||||
|
||||
for (const key in settings){
|
||||
if (key in this.settings){
|
||||
for (const key in settings) {
|
||||
if (key in this.settings) {
|
||||
this.settings[key] = settings[key]
|
||||
} else {
|
||||
throw `Invalid setting passed to TTS Provider: ${key}`
|
||||
}
|
||||
}
|
||||
|
||||
const apiCheckInterval = setInterval(() => {
|
||||
// Use Extras API if TTS support is enabled
|
||||
if (modules.includes('tts') || modules.includes('Coqui-tts')) {
|
||||
const baseUrl = new URL(getApiUrl());
|
||||
baseUrl.pathname = '/api/coqui-tts/coqui-tts';
|
||||
this.settings.provider_endpoint = baseUrl.toString();
|
||||
$('#Coqui_tts_endpoint').val(this.settings.provider_endpoint);
|
||||
clearInterval(apiCheckInterval);
|
||||
}
|
||||
}, 2000);
|
||||
|
||||
$('#Coqui_tts_endpoint').val(this.settings.provider_endpoint)
|
||||
|
||||
const textexample = document.getElementById('tts_voice_map');
|
||||
textexample.placeholder = 'Enter comma separated map of charName:ttsName[speakerID][langID]. Example: \nAqua:tts_models--en--ljspeech--glow-tts\model_file.pth,\nDarkness:tts_models--multilingual--multi-dataset--your_tts\model_file.pth[2][3]';
|
||||
|
||||
//Load models function
|
||||
this.getModels();
|
||||
eventSource.on(event_types.EXTRAS_CONNECTED, () => {
|
||||
this.getModels();
|
||||
});
|
||||
this.onttsCoquiHideButtons();
|
||||
console.info("Settings loaded")
|
||||
}
|
||||
|
||||
async onttsCoquiHideButtons(){
|
||||
async onttsCoquiHideButtons() {
|
||||
// Get references to the select element and the two input elements
|
||||
const ttsProviderSelect = document.getElementById('tts_provider');
|
||||
const ttsVoicesInput = document.getElementById('tts_voices');
|
||||
const ttsPreviewInput = document.getElementById('tts_preview');
|
||||
|
||||
|
||||
ttsProviderSelect.addEventListener('click', () => {
|
||||
this.getModels();
|
||||
});
|
||||
|
||||
// Add an event listener to the 'change' event of the tts_provider select element
|
||||
ttsProviderSelect.addEventListener('change', () => {
|
||||
// Check if the selected value is 'Coqui'
|
||||
if (ttsProviderSelect.value === 'Coqui') {
|
||||
ttsVoicesInput.style.display = 'none'; // Hide the tts_voices input
|
||||
ttsPreviewInput.style.display = ''; // Show the tts_preview input
|
||||
} else {
|
||||
ttsVoicesInput.style.display = ''; // Show the tts_voices input
|
||||
ttsPreviewInput.style.display = 'none'; // Hide the tts_preview input
|
||||
}
|
||||
// Check if the selected value is 'Coqui'
|
||||
if (ttsProviderSelect.value === 'Coqui') {
|
||||
ttsVoicesInput.style.display = 'none'; // Hide the tts_voices input
|
||||
ttsPreviewInput.style.display = ''; // Show the tts_preview input
|
||||
} else {
|
||||
ttsVoicesInput.style.display = ''; // Show the tts_voices input
|
||||
ttsPreviewInput.style.display = 'none'; // Hide the tts_preview input
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async onApplyClick() {
|
||||
return
|
||||
|
@ -135,262 +128,267 @@ class CoquiTtsProvider {
|
|||
|
||||
async getLang() {
|
||||
try {
|
||||
const response = await fetch(`${this.settings.provider_endpoint}/api/coqui-tts/multlang`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
}
|
||||
const voiceData = await response.json();
|
||||
|
||||
const modelSelect = document.getElementById('language');
|
||||
modelSelect.innerHTML = ''; // Clear existing options
|
||||
|
||||
if (Object.keys(voiceData).length === 0) {
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'None';
|
||||
modelSelect.appendChild(option);
|
||||
} else {
|
||||
for (const [key, value] of Object.entries(voiceData)) {
|
||||
const option = document.createElement('option');
|
||||
option.value = key;
|
||||
option.textContent = key + ": " + value;
|
||||
modelSelect.appendChild(option);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
//console.error('Error fetching voice data:', error);
|
||||
|
||||
// Remove all options except "None"
|
||||
const modelSelect = document.getElementById('language');
|
||||
modelSelect.innerHTML = '';
|
||||
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'None';
|
||||
modelSelect.appendChild(option);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
async getSpeakers() {
|
||||
try {
|
||||
const response = await fetch(`${this.settings.provider_endpoint}/api/coqui-tts/multspeaker`);
|
||||
const response = await doExtrasFetch(`${getApiUrl()}/api/coqui-tts/multlang`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
}
|
||||
const voiceData = await response.json();
|
||||
|
||||
const modelSelect = document.getElementById('speaker');
|
||||
modelSelect.innerHTML = ''; // Clear existing options
|
||||
|
||||
|
||||
const languageSelect = document.getElementById('coqui_language');
|
||||
languageSelect.innerHTML = ''; // Clear existing options
|
||||
|
||||
if (Object.keys(voiceData).length === 0) {
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'None';
|
||||
modelSelect.appendChild(option);
|
||||
} else {
|
||||
for (const [index, name] of Object.entries(voiceData)) {
|
||||
const option = document.createElement('option');
|
||||
option.value = index;
|
||||
option.textContent = index + ": " + name;
|
||||
modelSelect.appendChild(option);
|
||||
}
|
||||
option.value = 'none';
|
||||
option.textContent = 'None';
|
||||
languageSelect.appendChild(option);
|
||||
} else {
|
||||
for (const [key, value] of Object.entries(voiceData)) {
|
||||
const option = document.createElement('option');
|
||||
option.value = key;
|
||||
option.textContent = key + ": " + value;
|
||||
languageSelect.appendChild(option);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
} catch (error) {
|
||||
//console.error('Error fetching voice data:', error);
|
||||
|
||||
|
||||
// Remove all options except "None"
|
||||
const modelSelect = document.getElementById('speaker');
|
||||
modelSelect.innerHTML = '';
|
||||
|
||||
const languageSelect = document.getElementById('coqui_language');
|
||||
languageSelect.innerHTML = '';
|
||||
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'None';
|
||||
languageSelect.appendChild(option);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
async getSpeakers() {
|
||||
try {
|
||||
const response = await doExtrasFetch(`${getApiUrl()}/api/coqui-tts/multspeaker`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
}
|
||||
const voiceData = await response.json();
|
||||
|
||||
const speakerSelect = document.getElementById('coqui_speaker');
|
||||
speakerSelect.innerHTML = ''; // Clear existing options
|
||||
|
||||
if (Object.keys(voiceData).length === 0) {
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'None';
|
||||
speakerSelect.appendChild(option);
|
||||
} else {
|
||||
for (const [index, name] of Object.entries(voiceData)) {
|
||||
const option = document.createElement('option');
|
||||
option.value = index;
|
||||
option.textContent = index + ": " + name;
|
||||
speakerSelect.appendChild(option);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
//console.error('Error fetching voice data:', error);
|
||||
|
||||
// Remove all options except "None"
|
||||
const speakerSelect = document.getElementById('coqui_speaker');
|
||||
speakerSelect.innerHTML = '';
|
||||
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'None';
|
||||
speakerSelect.appendChild(option);
|
||||
}
|
||||
}
|
||||
|
||||
async getModels() {
|
||||
try {
|
||||
throwIfModuleMissing();
|
||||
const response = await doExtrasFetch(`${getApiUrl()}/api/coqui-tts/list`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
}
|
||||
const voiceIds = await response.json();
|
||||
|
||||
const modelSelect = document.getElementById('coqui_model');
|
||||
if (voiceIds.length === 0) {
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'Select Model';
|
||||
modelSelect.appendChild(option);
|
||||
} else {
|
||||
voiceIds.forEach(voiceId => {
|
||||
const option = document.createElement('option');
|
||||
option.value = voiceId;
|
||||
option.textContent = voiceId;
|
||||
modelSelect.appendChild(option);
|
||||
});
|
||||
}
|
||||
|
||||
// Update provider endpoint on model selection change
|
||||
modelSelect.addEventListener('change', () => {
|
||||
const selectedModel = modelSelect.value;
|
||||
this.LoadModel(selectedModel);
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Error fetching voice IDs:', error);
|
||||
|
||||
// Add "None" option when the request fails or the response is empty
|
||||
const modelSelect = document.getElementById('coqui_model');
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'None';
|
||||
modelSelect.appendChild(option);
|
||||
}
|
||||
}
|
||||
|
||||
async getModels() {
|
||||
try {
|
||||
const response = await fetch(`${this.settings.provider_endpoint}/api/coqui-tts/list`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
}
|
||||
const voiceIds = await response.json();
|
||||
|
||||
const modelSelect = document.getElementById('model');
|
||||
if (voiceIds.length === 0) {
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'Select Model';
|
||||
modelSelect.appendChild(option);
|
||||
} else {
|
||||
voiceIds.forEach(voiceId => {
|
||||
const option = document.createElement('option');
|
||||
option.value = voiceId;
|
||||
option.textContent = voiceId;
|
||||
modelSelect.appendChild(option);
|
||||
});
|
||||
}
|
||||
|
||||
// Update provider endpoint on model selection change
|
||||
modelSelect.addEventListener('change', () => {
|
||||
const selectedModel = modelSelect.value;
|
||||
this.LoadModel(selectedModel);
|
||||
});
|
||||
} catch (error) {
|
||||
console.error('Error fetching voice IDs:', error);
|
||||
|
||||
// Add "None" option when the request fails or the response is empty
|
||||
const modelSelect = document.getElementById('model');
|
||||
const option = document.createElement('option');
|
||||
option.value = 'none';
|
||||
option.textContent = 'None';
|
||||
modelSelect.appendChild(option);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async LoadModel(selectedModel) {
|
||||
const previewButton = document.getElementById('preview');
|
||||
async LoadModel(selectedModel) {
|
||||
const previewButton = document.getElementById('coqui_preview');
|
||||
previewButton.disabled = true;
|
||||
previewButton.innerText = "Loading";
|
||||
try {
|
||||
const response = await fetch(`${this.defaultSettings.provider_endpoint}/api/coqui-tts/load?_model=${selectedModel}`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
}
|
||||
this.getSpeakers();
|
||||
this.getLang();
|
||||
throwIfModuleMissing();
|
||||
const response = await doExtrasFetch(`${getApiUrl()}/api/coqui-tts/load?_model=${selectedModel}`);
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
||||
}
|
||||
this.getSpeakers();
|
||||
this.getLang();
|
||||
|
||||
const previewButton = document.getElementById('preview');
|
||||
previewButton.disabled = false;
|
||||
previewButton.innerText = "Play";
|
||||
const previewButton = document.getElementById('coqui_preview');
|
||||
previewButton.disabled = false;
|
||||
previewButton.innerText = "Play";
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error updating provider endpoint:', error);
|
||||
console.error('Error updating provider endpoint:', error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async getVoice(voiceName) {
|
||||
async getVoice(voiceName) {
|
||||
//tts_models--multilingual--multi-dataset--your_tts\model_file.pth[2][1]
|
||||
//tts_models--en--ljspeech--glow-tts\model_file.pth
|
||||
|
||||
|
||||
let _voiceNameOrg = voiceName; // Store the original voiceName in a variable _voiceNameOrg
|
||||
voiceName = voiceName.replace(/(\[\d+\])+$/, ''); // For example, converts 'model[2][1]' to 'model'
|
||||
|
||||
|
||||
this.voices = []; //reset for follow up runs
|
||||
|
||||
if (this.voices.length === 0) { this.voices = await this.fetchCheckMap(); }
|
||||
|
||||
|
||||
// Search for a voice object in the 'this.voices' array where the 'name' property matches the provided 'voiceName'
|
||||
|
||||
|
||||
//const match = this.voices.find((CoquiVoice) => CoquiVoice.name === voiceName);
|
||||
const match = this.voices.find((CoquiVoice) => CoquiVoice.name === voiceName);
|
||||
|
||||
// If no match is found, throw an error indicating that the TTS Voice name was not found
|
||||
if (!match) {
|
||||
throw new Error(`TTS Voice name ${voiceName} not found`);
|
||||
throw new Error(`TTS Voice name ${voiceName} not found`);
|
||||
} else {
|
||||
match.name = _voiceNameOrg;
|
||||
match.voice_id = _voiceNameOrg;
|
||||
match.name = _voiceNameOrg;
|
||||
match.voice_id = _voiceNameOrg;
|
||||
}
|
||||
// Return the matched voice object (with the 'name' property updated if a match was found)
|
||||
return match;
|
||||
}
|
||||
}
|
||||
|
||||
async fetchCheckMap() {
|
||||
const endpoint = `${this.settings.provider_endpoint}/api/coqui-tts/checkmap`;
|
||||
const response = await fetch(endpoint);
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${await response.json()}`);
|
||||
}
|
||||
const voiceData = await response.json();
|
||||
const voices = voiceData.map((voice) => ({
|
||||
id: voice.name,
|
||||
name: voice.id, // this is the issue!!!
|
||||
voice_id: voice.id, // this is the issue!!!
|
||||
//preview_url: false,
|
||||
lang: voice.lang,
|
||||
}));
|
||||
return voices;
|
||||
}
|
||||
|
||||
async fetchTtsVoiceIds() {
|
||||
const endpoint = `${this.settings.provider_endpoint}/api/coqui-tts/speaker_id`;
|
||||
const response = await fetch(endpoint);
|
||||
const endpoint = `${getApiUrl()}/api/coqui-tts/checkmap`;
|
||||
const response = await doExtrasFetch(endpoint);
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${await response.json()}`);
|
||||
}
|
||||
const voiceData = await response.json();
|
||||
const voices = voiceData.map((voice) => ({
|
||||
id: voice.name,
|
||||
name: voice.id, //add filename here
|
||||
voice_id: voice.id,
|
||||
//preview_url: false,
|
||||
//preview_url: `${this.settings.provider_endpoint}/api/coqui-tts/download?model=${voice.id}`,
|
||||
//http://localhost:5100/api/coqui-tts/speaker_id/tts_models/en/ljspeech/speedy-speech
|
||||
lang: voice.lang,
|
||||
}));
|
||||
return voices;
|
||||
}
|
||||
|
||||
sampleTtsVoice(voiceId) {
|
||||
// Get the selected values of speaker and language
|
||||
const speakerSelect = document.getElementById('speaker');
|
||||
const languageSelect = document.getElementById('language');
|
||||
const selectedSpeaker = speakerSelect.value;
|
||||
const selectedLanguage = languageSelect.value;
|
||||
|
||||
// Construct the URL with the selected values
|
||||
const url = `${this.settings.provider_endpoint}/api/coqui-tts/tts?text=The%20Quick%20Brown%20Fox%20Jumps%20Over%20the%20Lazy%20Dog.&speaker_id=${voiceId}&style_wav=&language_id=${selectedLanguage}&mspker=${selectedSpeaker}`;
|
||||
|
||||
fetch(url)
|
||||
.then(response => response.blob())
|
||||
.then(blob => {
|
||||
const audioUrl = URL.createObjectURL(blob);
|
||||
// Play the audio
|
||||
const audio = new Audio(audioUrl);
|
||||
audio.play();
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error performing TTS request:', error);
|
||||
});
|
||||
}
|
||||
|
||||
previewTtsVoice(voiceId) { //button on avail voices
|
||||
const url = `${this.settings.provider_endpoint}/api/coqui-tts/download?model=${voiceId}`;
|
||||
|
||||
fetch(url)
|
||||
.then(response => response.text()) // Expecting a text response
|
||||
.then(responseText => {
|
||||
const isResponseTrue = responseText.trim().toLowerCase() === 'true';
|
||||
|
||||
if (isResponseTrue) {
|
||||
console.log("Downloading Model") //if true
|
||||
} else {
|
||||
console.error('Already Installed'); //if false
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${await response.json()}`);
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error performing download:', error);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
async generateTts(text, voiceId){
|
||||
const voiceData = await response.json();
|
||||
const voices = voiceData.map((voice) => ({
|
||||
id: voice.name,
|
||||
name: voice.id, // this is the issue!!!
|
||||
voice_id: voice.id, // this is the issue!!!
|
||||
//preview_url: false,
|
||||
lang: voice.lang,
|
||||
}));
|
||||
return voices;
|
||||
}
|
||||
|
||||
async fetchTtsVoiceIds() {
|
||||
throwIfModuleMissing();
|
||||
const endpoint = `${getApiUrl()}/api/coqui-tts/speaker_id`;
|
||||
const response = await doExtrasFetch(endpoint);
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP ${response.status}: ${await response.json()}`);
|
||||
}
|
||||
const voiceData = await response.json();
|
||||
const voices = voiceData.map((voice) => ({
|
||||
id: voice.name,
|
||||
name: voice.id, //add filename here
|
||||
voice_id: voice.id,
|
||||
//preview_url: false,
|
||||
//preview_url: `${getApiUrl()}/api/coqui-tts/download?model=${voice.id}`,
|
||||
//http://localhost:5100/api/coqui-tts/speaker_id/tts_models/en/ljspeech/speedy-speech
|
||||
lang: voice.lang,
|
||||
}));
|
||||
return voices;
|
||||
}
|
||||
|
||||
sampleTtsVoice(voiceId) {
|
||||
// Get the selected values of speaker and language
|
||||
const speakerSelect = document.getElementById('coqui_speaker');
|
||||
const languageSelect = document.getElementById('coqui_language');
|
||||
const selectedSpeaker = speakerSelect.value;
|
||||
const selectedLanguage = languageSelect.value;
|
||||
|
||||
// Construct the URL with the selected values
|
||||
const url = `${getApiUrl()}/api/coqui-tts/tts?text=The%20Quick%20Brown%20Fox%20Jumps%20Over%20the%20Lazy%20Dog.&speaker_id=${voiceId}&style_wav=&language_id=${selectedLanguage}&mspker=${selectedSpeaker}`;
|
||||
|
||||
doExtrasFetch(url)
|
||||
.then(response => response.blob())
|
||||
.then(blob => {
|
||||
const audioUrl = URL.createObjectURL(blob);
|
||||
// Play the audio
|
||||
const audio = new Audio(audioUrl);
|
||||
audio.play();
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error performing TTS request:', error);
|
||||
});
|
||||
}
|
||||
|
||||
previewTtsVoice(voiceId) { //button on avail voices
|
||||
throwIfModuleMissing();
|
||||
const url = `${getApiUrl()}/api/coqui-tts/download?model=${voiceId}`;
|
||||
|
||||
doExtrasFetch(url)
|
||||
.then(response => response.text()) // Expecting a text response
|
||||
.then(responseText => {
|
||||
const isResponseTrue = responseText.trim().toLowerCase() === 'true';
|
||||
|
||||
if (isResponseTrue) {
|
||||
console.log("Downloading Model") //if true
|
||||
} else {
|
||||
console.error('Already Installed'); //if false
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error performing download:', error);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
async generateTts(text, voiceId) {
|
||||
const response = await this.fetchTtsGeneration(text, voiceId)
|
||||
return response
|
||||
}
|
||||
|
||||
async fetchTtsGeneration(inputText, voiceId) {
|
||||
console.info(`Generating new TTS for voice_id ${voiceId}`);
|
||||
const response = await fetch(`${this.settings.provider_endpoint}/api/coqui-tts/tts?text=${encodeURIComponent(inputText)}&speaker_id=${voiceId}`);
|
||||
if (!response.ok) {
|
||||
toastr.error(response.statusText, 'TTS Generation Failed');
|
||||
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
|
||||
}
|
||||
async fetchTtsGeneration(inputText, voiceId) {
|
||||
throwIfModuleMissing();
|
||||
console.info(`Generating new TTS for voice_id ${voiceId}`);
|
||||
const response = await doExtrasFetch(`${getApiUrl()}/api/coqui-tts/tts?text=${encodeURIComponent(inputText)}&speaker_id=${voiceId}`);
|
||||
if (!response.ok) {
|
||||
toastr.error(response.statusText, 'TTS Generation Failed');
|
||||
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
|
||||
}
|
||||
if (!response.ok) {
|
||||
toastr.error(response.statusText, 'TTS Generation Failed');
|
||||
throw new Error(`HTTP ${response.status}: ${await response.text()}`);
|
||||
|
|
|
@ -4243,6 +4243,10 @@ toolcool-color-picker {
|
|||
padding: 5px;
|
||||
}
|
||||
|
||||
.flex {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
.flex-container {
|
||||
display: flex;
|
||||
gap: 5px;
|
||||
|
@ -4849,6 +4853,10 @@ body.waifuMode .zoomed_avatar {
|
|||
gap: 5px;
|
||||
}
|
||||
|
||||
.flexGap10 {
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.timestamp {
|
||||
font-size: calc(var(--mainFontSize) * 0.7);
|
||||
font-weight: 400;
|
||||
|
|
Loading…
Reference in New Issue