Merge pull request #795 from Tony-sama/staging
Feature: Speech-to-text module using Vosk, Whisper. (one-hand mod only)
This commit is contained in:
commit
77124056b8
|
@ -73,6 +73,7 @@ const extension_settings = {
|
|||
fluctuation: 0.1,
|
||||
enabled: false,
|
||||
},
|
||||
speech_recognition: {},
|
||||
};
|
||||
|
||||
let modules = [];
|
||||
|
|
|
@ -0,0 +1,233 @@
|
|||
// Borrowed from Agnai (AGPLv3)
|
||||
// https://github.com/agnaistic/agnai/blob/dev/web/pages/Chat/components/SpeechRecognitionRecorder.tsx
|
||||
// First version by Cohee#1207
|
||||
// Adapted by Tony-sama
|
||||
|
||||
export { BrowserSttProvider }
|
||||
|
||||
const DEBUG_PREFIX = "<Speech Recognition module (Browser)> "
|
||||
|
||||
/**
 * Speech-to-text provider built on the browser's Web Speech API
 * (`window.SpeechRecognition` / `window.webkitSpeechRecognition`).
 *
 * The provider renders a language selector, wires the `#microphone_button`
 * element as a start/stop toggle and streams interim results into
 * `#send_textarea`. When recognition ends, the dictated text is removed from
 * the textarea again and handed to `processTranscriptFunction`.
 */
class BrowserSttProvider {
    //########//
    // Config //
    //########//

    /** Active settings; rebuilt from `defaultSettings` on every `loadSettings()` call. */
    settings = {
        language: "",
    };

    /** Fallback values; also the whitelist of setting keys accepted by `loadSettings()`. */
    defaultSettings = {
        language: "en-US",
    };

    /** Callback receiving the dictated text once recognition ends; assigned by the caller. */
    processTranscriptFunction = null;

    /**
     * Markup of the provider-specific settings (a language `<select>`).
     * @returns {string} HTML fragment.
     */
    get settingsHtml() {
        const languages = [
            ["ar-SA", "Arabic (Saudi Arabia)"],
            ["bn-BD", "Bangla (Bangladesh)"],
            ["bn-IN", "Bangla (India)"],
            ["cs-CZ", "Czech (Czech Republic)"],
            ["da-DK", "Danish (Denmark)"],
            ["de-AT", "German (Austria)"],
            ["de-CH", "German (Switzerland)"],
            ["de-DE", "German (Germany)"],
            ["el-GR", "Greek (Greece)"],
            ["en-AU", "English (Australia)"],
            ["en-CA", "English (Canada)"],
            ["en-GB", "English (United Kingdom)"],
            ["en-IE", "English (Ireland)"],
            ["en-IN", "English (India)"],
            ["en-NZ", "English (New Zealand)"],
            ["en-US", "English (United States)"],
            ["en-ZA", "English (South Africa)"],
            ["es-AR", "Spanish (Argentina)"],
            ["es-CL", "Spanish (Chile)"],
            ["es-CO", "Spanish (Colombia)"],
            ["es-ES", "Spanish (Spain)"],
            ["es-MX", "Spanish (Mexico)"],
            ["es-US", "Spanish (United States)"],
            ["fi-FI", "Finnish (Finland)"],
            ["fr-BE", "French (Belgium)"],
            ["fr-CA", "French (Canada)"],
            ["fr-CH", "French (Switzerland)"],
            ["fr-FR", "French (France)"],
            ["he-IL", "Hebrew (Israel)"],
            ["hi-IN", "Hindi (India)"],
            ["hu-HU", "Hungarian (Hungary)"],
            ["id-ID", "Indonesian (Indonesia)"],
            ["it-CH", "Italian (Switzerland)"],
            ["it-IT", "Italian (Italy)"],
            ["ja-JP", "Japanese (Japan)"],
            ["ko-KR", "Korean (Republic of Korea)"],
            ["nl-BE", "Dutch (Belgium)"],
            ["nl-NL", "Dutch (The Netherlands)"],
            ["no-NO", "Norwegian (Norway)"],
            ["pl-PL", "Polish (Poland)"],
            ["pt-BR", "Portuguese (Brazil)"],
            ["pt-PT", "Portuguese (Portugal)"],
            ["ro-RO", "Romanian (Romania)"],
            ["ru-RU", "Russian (Russian Federation)"],
            ["sk-SK", "Slovak (Slovakia)"],
            ["sv-SE", "Swedish (Sweden)"],
            ["ta-IN", "Tamil (India)"],
            ["ta-LK", "Tamil (Sri Lanka)"],
            ["th-TH", "Thai (Thailand)"],
            ["tr-TR", "Turkish (Turkey)"],
            ["zh-CN", "Chinese (China)"],
            ["zh-HK", "Chinese (Hong Kong)"],
            ["zh-TW", "Chinese (Taiwan)"],
        ];

        const optionTags = languages
            .map(([code, label]) => `<option value="${code}">${code}: ${label}</option>`)
            .join(" ");

        return `<span>Language</span> <br> <select id="speech_recognition_browser_provider_language"> ${optionTags} </select>`;
    }

    /**
     * Reads the language selector and re-applies the settings.
     * Used when provider settings are updated from the UI.
     */
    onSettingsChange() {
        this.settings.language = $("#speech_recognition_browser_provider_language").val();
        console.debug(DEBUG_PREFIX+"Change language to",this.settings.language);
        this.loadSettings(this.settings);
    }

    /**
     * Upper-cases the first letter of a transcript fragment.
     *
     * A single leading space is dropped when the fragment is longer than two
     * characters; fragments of one character or less are returned unchanged.
     *
     * @param {string} interimTranscript Raw fragment from the recogniser.
     * @returns {string} The capitalised fragment.
     */
    static capitalizeInterim(interimTranscript) {
        let capitalizeIndex = -1;
        if (interimTranscript.length > 2 && interimTranscript[0] === ' ') capitalizeIndex = 1;
        else if (interimTranscript.length > 1) capitalizeIndex = 0;

        if (capitalizeIndex > -1) {
            const spacing = capitalizeIndex > 0 ? ' '.repeat(capitalizeIndex - 1) : '';
            const capitalized = interimTranscript[capitalizeIndex].toLocaleUpperCase();
            const rest = interimTranscript.substring(capitalizeIndex + 1);
            interimTranscript = spacing + capitalized + rest;
        }
        return interimTranscript;
    }

    /**
     * Joins two transcript pieces, inserting a space after a sentence-ending period.
     *
     * @param {string} previous Text accumulated so far.
     * @param {string} interim Text to append.
     * @returns {string} The combined text.
     */
    static composeValues(previous, interim) {
        const spacing = previous.endsWith('.') ? ' ' : '';
        return previous + spacing + interim;
    }

    /**
     * Applies `settings` on top of the defaults, syncs the language selector and
     * (re)creates the recognition session bound to the microphone button.
     *
     * @param {Object} settings Saved provider settings; only keys present in
     *     `defaultSettings` are accepted.
     * @throws {Error} When `settings` contains an unknown key.
     */
    loadSettings(settings) {
        const processTranscript = this.processTranscriptFunction;
        const incoming = settings ?? {};

        // Populate Provider UI given input settings
        if (Object.keys(incoming).length === 0) {
            console.debug(DEBUG_PREFIX+"Using default browser STT settings")
        }

        // Start from a copy so that applying saved values never rewrites the defaults.
        this.settings = { ...this.defaultSettings };

        for (const key in incoming) {
            if (!(key in this.settings)) {
                throw new Error(`Invalid setting passed to Speech recogniton extension (browser): ${key}`);
            }
            this.settings[key] = incoming[key];
        }

        $("#speech_recognition_browser_provider_language").val(this.settings.language);

        // Custom grammar hook; the empty string leaves the recogniser's grammar untouched.
        const grammar = '';

        const speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
        const speechRecognitionList = window.SpeechGrammarList || window.webkitSpeechGrammarList;

        if (!speechRecognition) {
            console.warn(DEBUG_PREFIX+'Speech recognition is not supported in this browser.');
            $("#microphone_button").hide();
            toastr.error("Speech recognition is not supported in this browser, use another browser or another provider of SillyTavern-extras Speech recognition extension.", "Speech recognition activation Failed (Browser)", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
            return;
        }

        const recognition = new speechRecognition();

        if (grammar && speechRecognitionList) {
            // addFromString is an instance method, so a list has to be constructed first.
            const grammarList = new speechRecognitionList();
            grammarList.addFromString(grammar, 1);
            recognition.grammars = grammarList;
        }

        recognition.continuous = true;
        recognition.interimResults = true;
        recognition.lang = this.settings.language;

        const textarea = $('#send_textarea');
        const button = $('#microphone_button');

        let listening = false;
        // Drop handlers installed by a previous provider/session before binding ours.
        button.off('click').on("click", function () {
            if (listening) {
                recognition.stop();
            } else {
                recognition.start();
            }
            listening = !listening;
        });

        // Textarea content present when dictation started; dictated text is shown after it.
        let initialText = '';

        recognition.onresult = function (speechEvent) {
            let finalTranscript = '';
            let interimTranscript = '';

            for (let i = speechEvent.resultIndex; i < speechEvent.results.length; ++i) {
                const result = speechEvent.results[i];
                const transcript = result[0].transcript;

                if (!result.isFinal) {
                    interimTranscript += transcript;
                    continue;
                }

                const sentence = BrowserSttProvider.capitalizeInterim(transcript);
                if (sentence !== '') {
                    finalTranscript = BrowserSttProvider.composeValues(finalTranscript, sentence);
                    const lastChar = finalTranscript.slice(-1);
                    if (lastChar !== '.' && lastChar !== '?') finalTranscript += '.';
                    // A final result ends the session; onend then forwards the text.
                    recognition.abort();
                    listening = false;
                }
                interimTranscript = ' ';
            }

            interimTranscript = BrowserSttProvider.capitalizeInterim(interimTranscript);

            textarea.val(initialText + finalTranscript + interimTranscript);
        };

        recognition.onerror = function (event) {
            console.error('Error occurred in recognition:', event.error);
        };

        recognition.onend = function () {
            listening = false;
            button.toggleClass('fa-microphone fa-microphone-slash');
            // Split off what was dictated, restore the textarea and hand the text over.
            const currentText = textarea.val();
            const newText = currentText.substring(initialText.length);
            textarea.val(currentText.substring(0, initialText.length));
            if (typeof processTranscript === 'function') {
                processTranscript(newText);
            } else {
                console.warn(DEBUG_PREFIX+'No transcript handler set, dictated text dropped');
            }
        };

        recognition.onstart = function () {
            initialText = textarea.val();
            button.toggleClass('fa-microphone fa-microphone-slash');

            if ($("#speech_recognition_message_mode").val() === "replace") {
                textarea.val("");
                initialText = "";
            }
        };

        $("#microphone_button").show();

        console.debug(DEBUG_PREFIX+"Browser STT settings loaded")
    }
}
|
|
@ -1,110 +1,351 @@
|
|||
// Borrowed from Agnai (AGPLv3)
|
||||
// https://github.com/agnaistic/agnai/blob/dev/web/pages/Chat/components/SpeechRecognitionRecorder.tsx
|
||||
function capitalizeInterim(interimTranscript) {
|
||||
let capitalizeIndex = -1;
|
||||
if (interimTranscript.length > 2 && interimTranscript[0] === ' ') capitalizeIndex = 1;
|
||||
else if (interimTranscript.length > 1) capitalizeIndex = 0;
|
||||
if (capitalizeIndex > -1) {
|
||||
const spacing = capitalizeIndex > 0 ? ' '.repeat(capitalizeIndex - 1) : '';
|
||||
const capitalized = interimTranscript[capitalizeIndex].toLocaleUpperCase();
|
||||
const rest = interimTranscript.substring(capitalizeIndex + 1);
|
||||
interimTranscript = spacing + capitalized + rest;
|
||||
}
|
||||
return interimTranscript;
|
||||
/*
|
||||
TODO:
|
||||
- try pseudo streaming audio by just sending chunk every X seconds and asking VOSK if it is full text.
|
||||
*/
|
||||
|
||||
import { saveSettingsDebounced } from "../../../script.js";
|
||||
import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper, doExtrasFetch } from "../../extensions.js";
|
||||
import { VoskSttProvider } from './vosk.js'
|
||||
import { WhisperSttProvider } from './whisper.js'
|
||||
import { BrowserSttProvider } from './browser.js'
|
||||
export { MODULE_NAME };
|
||||
|
||||
const MODULE_NAME = 'Speech Recognition';
|
||||
const DEBUG_PREFIX = "<Speech Recognition module> "
|
||||
|
||||
let sttProviders = {
|
||||
None: null,
|
||||
Browser: BrowserSttProvider,
|
||||
Whisper: WhisperSttProvider,
|
||||
Vosk: VoskSttProvider,
|
||||
}
|
||||
|
||||
function composeValues(previous, interim) {
|
||||
let spacing = '';
|
||||
if (previous.endsWith('.')) spacing = ' ';
|
||||
return previous + spacing + interim;
|
||||
let sttProvider = null
|
||||
let sttProviderName = "None"
|
||||
|
||||
let audioRecording = false
|
||||
const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } };
|
||||
let audioChunks = [];
|
||||
|
||||
/**
 * Routes a recognised transcript to the chat according to the configured
 * message mode.
 *
 * When message mapping is enabled and the normalised transcript (lower-cased,
 * punctuation removed, whitespace collapsed) contains a mapped phrase, the
 * mapped message is written to `#send_textarea` instead and, in "auto_send"
 * mode, generation is triggered. Otherwise the transcript is sent
 * ("auto_send"), replaces the textarea content ("replace") or is appended to
 * it ("append").
 *
 * Failures are logged and never propagate to the caller.
 *
 * @param {string} transcript Text returned by the speech-to-text provider.
 * @returns {Promise<void>}
 */
async function processTranscript(transcript) {
    try {
        // Anything that is not a string is treated as "nothing recognised".
        const transcriptFormatted = typeof transcript === "string" ? transcript.trim() : "";

        if (transcriptFormatted.length === 0) {
            console.debug(DEBUG_PREFIX+"Empty transcript, do nothing");
            return;
        }

        console.debug(DEBUG_PREFIX+"recorded transcript: \""+transcriptFormatted+"\"");
        const messageMode = extension_settings.speech_recognition.messageMode;
        console.debug(DEBUG_PREFIX+"mode: "+messageMode);

        // Normalise for phrase lookup: lower case, remove punctuation, collapse whitespace.
        const transcriptRaw = transcriptFormatted
            .toLowerCase()
            .replace(/[^\w\s\']|_/g, "")
            .replace(/\s+/g, " ");

        // Check message mapping
        if (extension_settings.speech_recognition.messageMappingEnabled) {
            const messageMapping = extension_settings.speech_recognition.messageMapping;
            console.debug(DEBUG_PREFIX+"Start searching message mapping into:",transcriptRaw)
            for (const key in messageMapping) {
                console.debug(DEBUG_PREFIX+"message mapping searching: ", key,"=>",messageMapping[key]);
                // An empty phrase is contained in every string and would hijack all transcripts.
                if (key === "" || !transcriptRaw.includes(key)) {
                    continue;
                }

                const mappedMessage = messageMapping[key];
                console.debug(DEBUG_PREFIX+"message mapping found: ", key,"=>",mappedMessage);
                $("#send_textarea").val(mappedMessage);

                if (messageMode === "auto_send") await getContext().generate();
                return;
            }
        }

        console.debug(DEBUG_PREFIX+"no message mapping found, processing transcript as normal message");

        switch (messageMode) {
            case "auto_send": {
                $('#send_textarea').val("") // clear message area to avoid double message

                console.debug(DEBUG_PREFIX+"Sending message")
                const context = getContext();
                const message = {
                    name: context.name1,
                    is_user: true,
                    is_name: true,
                    send_date: Date.now(),
                    mes: transcriptFormatted,
                };
                context.chat.push(message);
                context.addOneMessage(message);

                await context.generate();

                $('#debug_output').text("<SST-module DEBUG>: message sent: \""+ transcriptFormatted +"\"");
                break;
            }

            case "replace":
                console.debug(DEBUG_PREFIX+"Replacing message")
                $('#send_textarea').val(transcriptFormatted);
                break;

            case "append": {
                console.debug(DEBUG_PREFIX+"Appending message")
                const existingText = $('#send_textarea').val();
                // Only separate with a space when there is text to separate from.
                const combinedText = existingText.length > 0
                    ? existingText+" "+transcriptFormatted
                    : transcriptFormatted;
                $('#send_textarea').val(combinedText);
                break;
            }

            default:
                console.debug(DEBUG_PREFIX+"Not supported stt message mode: "+messageMode)
        }
    }
    catch (error) {
        console.error(DEBUG_PREFIX+"Failed to process transcript:", error);
    }
}
|
||||
|
||||
/**
 * Turns `#microphone_button` into a record/stop toggle backed by
 * `MediaRecorder`.
 *
 * When a recording stops, the collected chunks are wrapped into a Blob, sent
 * to the current provider's `processAudio()` and the resulting transcript is
 * passed to `processTranscript()`. If the browser exposes no
 * `navigator.mediaDevices.getUserMedia`, an error toast is shown instead.
 */
function loadNavigatorAudioRecording() {
    // `navigator.mediaDevices` itself may be undefined, so test it before its member.
    if (!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia)) {
        console.debug(DEBUG_PREFIX+"getUserMedia not supported on your browser!");
        toastr.error("getUserMedia not supported", DEBUG_PREFIX+"not supported for your browser.", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
        return;
    }

    console.debug(DEBUG_PREFIX+' getUserMedia supported by browser.');

    const onSuccess = function (stream) {
        const mediaRecorder = new MediaRecorder(stream);

        // Replace any click handler installed by a previously loaded provider.
        $("#microphone_button").off('click').on("click", function () {
            if (!audioRecording) {
                mediaRecorder.start();
                console.debug(mediaRecorder.state);
                console.debug("recorder started");
                audioRecording = true;
            }
            else {
                mediaRecorder.stop();
                console.debug(mediaRecorder.state);
                console.debug("recorder stopped");
                audioRecording = false;
            }
            $("#microphone_button").toggleClass('fa-microphone fa-microphone-slash');
        });

        mediaRecorder.onstop = async function () {
            console.debug(DEBUG_PREFIX+"data available after MediaRecorder.stop() called: ", audioChunks.length, " chunks");
            const audioBlob = new Blob(audioChunks, { type: "audio/wav; codecs=0" });
            audioChunks = [];

            try {
                const transcript = await sttProvider.processAudio(audioBlob);

                // TODO: lock and release recording while processing?
                console.debug(DEBUG_PREFIX+"received transcript:", transcript);
                await processTranscript(transcript);
            } catch (error) {
                // Nothing awaits this handler, so a rejection has to be handled here.
                console.error(DEBUG_PREFIX+"Audio transcription failed:", error);
            }
        };

        mediaRecorder.ondataavailable = function (e) {
            audioChunks.push(e.data);
        };
    };

    const onError = function (err) {
        console.error(DEBUG_PREFIX+"The following error occured: " + err);
    };

    // `.catch` also covers exceptions thrown while setting up the recorder.
    navigator.mediaDevices.getUserMedia(constraints).then(onSuccess).catch(onError);
}
|
||||
|
||||
//##############//
|
||||
// STT Provider //
|
||||
//##############//
|
||||
|
||||
/**
 * Activates the speech-to-text provider registered under `provider`.
 *
 * Clears the provider settings panel, records the choice in
 * `extension_settings.speech_recognition`, instantiates the provider, renders
 * its settings markup and loads its saved settings. The "Browser" provider
 * receives `processTranscript` as its callback; the other providers use the
 * microphone button as a recorder via `loadNavigatorAudioRecording()`.
 * Selecting "None" hides the microphone button and the message mode/mapping
 * controls.
 *
 * @param {string} provider Key of `sttProviders`; names without a registered
 *     provider fall back to "None".
 */
function loadSttProvider(provider) {
    let providerName = provider;
    if (!Object.prototype.hasOwnProperty.call(sttProviders, providerName)) {
        // A stale or missing saved name must not reach `new sttProviders[...]`.
        console.warn(DEBUG_PREFIX+`Unknown provider ${providerName}, falling back to None`);
        providerName = "None";
    }

    //Clear the current config and add new config
    $("#speech_recognition_provider_settings").html("");

    // Init provider references
    extension_settings.speech_recognition.currentProvider = providerName;
    sttProviderName = providerName;

    if (!(sttProviderName in extension_settings.speech_recognition)) {
        console.warn(`Provider ${sttProviderName} not in Extension Settings, initiatilizing provider in settings`);
        extension_settings.speech_recognition[sttProviderName] = {};
    }

    $('#speech_recognition_provider').val(sttProviderName);

    if (sttProviderName === "None") {
        $("#microphone_button").hide();
        $("#speech_recognition_message_mode_div").hide();
        $("#speech_recognition_message_mapping_div").hide();
        return;
    }

    $("#speech_recognition_message_mode_div").show();
    $("#speech_recognition_message_mapping_div").show();

    sttProvider = new sttProviders[sttProviderName]();

    // Init provider settings
    $('#speech_recognition_provider_settings').append(sttProvider.settingsHtml);

    const providerSettings = extension_settings.speech_recognition[sttProviderName];

    // Use microphone button as push to talk
    if (sttProviderName === "Browser") {
        // The callback has to be set before loadSettings(), which captures it.
        sttProvider.processTranscriptFunction = processTranscript;
        sttProvider.loadSettings(providerSettings);
    }
    else {
        sttProvider.loadSettings(providerSettings);
        loadNavigatorAudioRecording();

        $("#microphone_button").show();
    }
}
|
||||
|
||||
recognition.continuous = true;
|
||||
recognition.interimResults = true;
|
||||
// TODO: This should be configurable.
|
||||
recognition.lang = 'en-US'; // Set the language to English (US).
|
||||
/**
 * Change handler of the provider drop-down: loads the provider currently
 * selected in `#speech_recognition_provider` and schedules a settings save.
 */
function onSttProviderChange() {
    loadSttProvider($('#speech_recognition_provider').val());
    saveSettingsDebounced();
}
|
||||
|
||||
const $textarea = this;
|
||||
const $button = $('<div class="fa-solid fa-microphone speech-toggle" title="Click to speak"></div>');
|
||||
/**
 * Input handler of the provider settings form: lets the active provider pick
 * up its changed controls, then stores the provider's settings under its name
 * in the extension settings and schedules a save.
 */
function onSttProviderSettingsInput() {
    sttProvider.onSettingsChange();

    // Persist changes to SillyTavern stt extension settings
    const { settings: providerSettings } = sttProvider;
    extension_settings.speech_recognition[sttProviderName] = providerSettings;
    saveSettingsDebounced();
    console.info(`Saved settings ${sttProviderName} ${JSON.stringify(providerSettings)}`);
}
|
||||
|
||||
//#############################//
|
||||
// Extension UI and Settings //
|
||||
//#############################//
|
||||
|
||||
// Initial values of `extension_settings.speech_recognition`.
const defaultSettings = {
    currentProvider: "None",
    messageMode: "append",
    messageMappingText: "",
    // Phrase => message lookup; an object, matching how the mapping is rebuilt and iterated.
    messageMapping: {},
    messageMappingEnabled: false,
};
|
||||
|
||||
/**
 * Completes `extension_settings.speech_recognition` with any missing default
 * values and mirrors the stored values into the settings controls.
 */
function loadSettings() {
    const settings = extension_settings.speech_recognition;

    // Fill in key by key so a partially saved configuration is completed
    // instead of being left with undefined entries.
    for (const key of Object.keys(defaultSettings)) {
        if (!(key in settings)) {
            settings[key] = defaultSettings[key];
        }
    }

    $('#speech_recognition_enabled').prop('checked', settings.enabled);
    $('#speech_recognition_message_mode').val(settings.messageMode);

    if (settings.messageMappingText.length > 0) {
        $('#speech_recognition_message_mapping').val(settings.messageMappingText);
    }

    $('#speech_recognition_message_mapping_enabled').prop('checked', settings.messageMappingEnabled);
}
|
||||
|
||||
/**
 * Change handler of the message mode drop-down: stores the selected mode,
 * shows `#speech_recognition_wait_response_div` only for "auto_send" with a
 * provider other than "Browser", and schedules a settings save.
 */
async function onMessageModeChange() {
    const messageMode = $('#speech_recognition_message_mode').val();
    extension_settings.speech_recognition.messageMode = messageMode;

    // Logical AND with strict comparisons (was a bitwise `&` over loose comparisons).
    const showWaitResponse = sttProviderName !== "Browser" && messageMode === "auto_send";
    if (showWaitResponse) {
        $("#speech_recognition_wait_response_div").show()
    }
    else {
        $("#speech_recognition_wait_response_div").hide()
    }

    saveSettingsDebounced();
}
|
||||
|
||||
/**
 * Change handler of the message mapping textarea.
 *
 * Parses comma separated `phrase = message` entries (lower-cased and trimmed)
 * into `extension_settings.speech_recognition.messageMapping`, shows the
 * result in the status line, stores the raw text and schedules a settings
 * save. Entries without "=" or with an empty phrase are skipped.
 */
async function onMessageMappingChange() {
    const mappingText = $('#speech_recognition_message_mapping').val();
    const entries = mappingText
        .split(",")
        .map((entry) => entry.trim())
        .filter((entry) => entry !== '');

    const messageMapping = {};
    for (const entry of entries) {
        const lowered = entry.toLowerCase();
        // Split on the first "=" only, so the message itself may contain "=".
        const separatorIndex = lowered.indexOf("=");
        if (separatorIndex === -1) {
            console.debug(DEBUG_PREFIX+"Wrong syntax for message mapping, no '=' found in:", entry);
            continue;
        }

        const phrase = lowered.slice(0, separatorIndex).trim();
        const message = lowered.slice(separatorIndex + 1).trim();
        if (phrase === "") {
            // An empty phrase would be found in every transcript.
            console.debug(DEBUG_PREFIX+"Wrong syntax for message mapping, empty phrase in:", entry);
            continue;
        }

        messageMapping[phrase] = message;
        console.debug(DEBUG_PREFIX+"Added mapping", phrase,"=>", message);
    }
    extension_settings.speech_recognition.messageMapping = messageMapping;

    $("#speech_recognition_message_mapping_status").text("Message mapping updated to: "+JSON.stringify(messageMapping))
    console.debug(DEBUG_PREFIX+"Updated message mapping", messageMapping);
    extension_settings.speech_recognition.messageMappingText = mappingText;
    saveSettingsDebounced();
}
|
||||
|
||||
/**
 * Click handler of the "Enable messages mapping" checkbox: stores its checked
 * state in the extension settings and schedules a save.
 */
async function onMessageMappingEnabledClick() {
    const isChecked = $('#speech_recognition_message_mapping_enabled').is(':checked');
    extension_settings.speech_recognition.messageMappingEnabled = isChecked;
    saveSettingsDebounced();
}
|
||||
|
||||
// Extension entry point: builds the settings UI and the microphone button once
// the DOM is ready, then restores the saved settings and provider.
$(document).ready(function () {
    /**
     * Appends the settings drawer to `#extensions_settings`, fills the provider
     * drop-down from `sttProviders`, binds the change handlers and adds the
     * microphone button in front of the send controls.
     */
    function addExtensionControls() {
        const settingsHtml = `
        <div id="speech_recognition_settings">
            <div class="inline-drawer">
                <div class="inline-drawer-toggle inline-drawer-header">
                    <b>Speech Recognition</b>
                    <div class="inline-drawer-icon fa-solid fa-circle-chevron-down down"></div>
                </div>
                <div class="inline-drawer-content">
                    <div>
                        <span>Select Speech-to-text Provider</span> <br>
                        <select id="speech_recognition_provider">
                        </select>
                    </div>
                    <div id="speech_recognition_message_mode_div">
                        <span>Message Mode</span> <br>
                        <select id="speech_recognition_message_mode">
                            <option value="append">Append</option>
                            <option value="replace">Replace</option>
                            <option value="auto_send">Auto send</option>
                        </select>
                    </div>
                    <div id="speech_recognition_message_mapping_div">
                        <span>Message Mapping</span>
                        <textarea id="speech_recognition_message_mapping" class="text_pole textarea_compact" type="text" rows="4" placeholder="Enter comma separated phrases mapping, example:\ncommand delete = /del 2,\nslash delete = /del 2,\nsystem roll = /roll 2d6,\nhey continue = /continue"></textarea>
                        <span id="speech_recognition_message_mapping_status"></span>
                        <label class="checkbox_label" for="speech_recognition_message_mapping_enabled">
                            <input type="checkbox" id="speech_recognition_message_mapping_enabled" name="speech_recognition_message_mapping_enabled">
                            <small>Enable messages mapping</small>
                        </label>
                    </div>
                    <form id="speech_recognition_provider_settings" class="inline-drawer-content">
                    </form>
                </div>
            </div>
        </div>
        `;
        $('#extensions_settings').append(settingsHtml);
        $('#speech_recognition_provider_settings').on('input', onSttProviderSettingsInput);
        for (const provider in sttProviders) {
            $('#speech_recognition_provider').append($("<option />").val(provider).text(provider));
            console.debug(DEBUG_PREFIX+"added option "+provider);
        }
        $('#speech_recognition_provider').on('change', onSttProviderChange);
        $('#speech_recognition_message_mode').on('change', onMessageModeChange);
        $('#speech_recognition_message_mapping').on('change', onMessageMappingChange);
        $('#speech_recognition_message_mapping_enabled').on('click', onMessageMappingEnabledClick);

        // The click behaviour of the button is bound later by the provider that gets loaded.
        const $button = $('<div id="microphone_button" class="fa-solid fa-microphone speech-toggle" title="Click to speak"></div>');
        $('#send_but_sheld').prepend($button);
    }

    addExtensionControls(); // No init dependencies
    loadSettings(); // Depends on the extension controls
    loadSttProvider(extension_settings.speech_recognition.currentProvider); // Depends on the controls and loaded settings
})
|
||||
|
|
|
@ -2,10 +2,13 @@
|
|||
"display_name": "Speech Recognition",
|
||||
"loading_order": 13,
|
||||
"requires": [],
|
||||
"optional": [],
|
||||
"optional": [
|
||||
"vosk-speech-recognition",
|
||||
"whisper-speech-recognition"
|
||||
],
|
||||
"js": "index.js",
|
||||
"css": "style.css",
|
||||
"author": "Cohee#1207",
|
||||
"version": "1.0.0",
|
||||
"author": "Cohee#1207 and Keij#6799",
|
||||
"version": "1.1.0",
|
||||
"homePage": "https://github.com/SillyTavern/SillyTavern"
|
||||
}
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
import { getApiUrl, doExtrasFetch } from "../../extensions.js";
|
||||
export { VoskSttProvider }
|
||||
|
||||
const DEBUG_PREFIX = "<Speech Recognition module (Vosk)> "
|
||||
|
||||
/**
 * Speech-to-text provider that posts recorded audio to the
 * `/api/speech-recognition/vosk/process-audio` endpoint of the extras API.
 */
class VoskSttProvider {
    //########//
    // Config //
    //########//

    /** Active settings; rebuilt from `defaultSettings` on every `loadSettings()` call. */
    settings = {};

    /** Fallback values; also the whitelist of setting keys accepted by `loadSettings()`. */
    defaultSettings = {
    };

    /**
     * Markup of the provider-specific settings (this provider has none).
     * @returns {string} HTML fragment.
     */
    get settingsHtml() {
        return "";
    }

    onSettingsChange() {
        // Used when provider settings are updated from UI
    }

    /**
     * Applies `settings` on top of the defaults.
     *
     * @param {Object} settings Saved provider settings; only keys present in
     *     `defaultSettings` are accepted.
     * @throws {Error} When `settings` contains an unknown key.
     */
    loadSettings(settings) {
        const incoming = settings ?? {};

        // Populate Provider UI given input settings
        if (Object.keys(incoming).length === 0) {
            console.debug(DEBUG_PREFIX+"Using default vosk STT extension settings")
        }

        // Only accept keys defined in defaultSettings; work on a copy so the
        // defaults themselves are never overwritten.
        this.settings = { ...this.defaultSettings };

        for (const key in incoming) {
            if (!(key in this.settings)) {
                throw new Error(`Invalid setting passed to STT extension: ${key}`);
            }
            this.settings[key] = incoming[key];
        }

        console.debug(DEBUG_PREFIX+"Vosk STT settings loaded")
    }

    /**
     * Uploads a recording and returns the recognised text.
     *
     * @param {Blob} audioblob Recorded audio, sent as form field `AudioFile`.
     * @returns {Promise<string>} The `transcript` field of the JSON response.
     * @throws {Error} When the API answers with a non-OK status.
     */
    async processAudio(audioblob) {
        const requestData = new FormData();
        requestData.append('AudioFile', audioblob, 'record.wav');

        const url = new URL(getApiUrl());
        url.pathname = '/api/speech-recognition/vosk/process-audio';

        const apiResult = await doExtrasFetch(url, {
            method: 'POST',
            body: requestData,
        });

        if (!apiResult.ok) {
            toastr.error(apiResult.statusText, 'STT Generation Failed (Vosk)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
            throw new Error(`HTTP ${apiResult.status}: ${await apiResult.text()}`);
        }

        const result = await apiResult.json();
        return result.transcript;
    }
}
|
|
@ -0,0 +1,67 @@
|
|||
import { getApiUrl, doExtrasFetch } from "../../extensions.js";
|
||||
export { WhisperSttProvider }
|
||||
|
||||
// Log prefix of this provider (previously carried the Vosk label).
const DEBUG_PREFIX = "<Speech Recognition module (Whisper)> "

/**
 * Speech-to-text provider that posts recorded audio to the
 * `/api/speech-recognition/whisper/process-audio` endpoint of the extras API.
 */
class WhisperSttProvider {
    //########//
    // Config //
    //########//

    /** Active settings; rebuilt from `defaultSettings` on every `loadSettings()` call. */
    settings = {};

    /** Fallback values; also the whitelist of setting keys accepted by `loadSettings()`. */
    defaultSettings = {
        //model_path: "",
    };

    /**
     * Markup of the provider-specific settings (this provider has none).
     * @returns {string} HTML fragment.
     */
    get settingsHtml() {
        return "";
    }

    onSettingsChange() {
        // Used when provider settings are updated from UI
    }

    /**
     * Applies `settings` on top of the defaults.
     *
     * @param {Object} settings Saved provider settings; only keys present in
     *     `defaultSettings` are accepted.
     * @throws {Error} When `settings` contains an unknown key.
     */
    loadSettings(settings) {
        const incoming = settings ?? {};

        // Populate Provider UI given input settings
        if (Object.keys(incoming).length === 0) {
            console.debug(DEBUG_PREFIX+"Using default Whisper STT extension settings")
        }

        // Only accept keys defined in defaultSettings; work on a copy so the
        // defaults themselves are never overwritten.
        this.settings = { ...this.defaultSettings };

        for (const key in incoming) {
            if (!(key in this.settings)) {
                throw new Error(`Invalid setting passed to STT extension: ${key}`);
            }
            this.settings[key] = incoming[key];
        }

        console.debug(DEBUG_PREFIX+"Whisper STT settings loaded")
    }

    /**
     * Uploads a recording and returns the recognised text.
     *
     * @param {Blob} audioblob Recorded audio, sent as form field `AudioFile`.
     * @returns {Promise<string>} The `transcript` field of the JSON response.
     * @throws {Error} When the API answers with a non-OK status.
     */
    async processAudio(audioblob) {
        const requestData = new FormData();
        requestData.append('AudioFile', audioblob, 'record.wav');

        const url = new URL(getApiUrl());
        url.pathname = '/api/speech-recognition/whisper/process-audio';

        const apiResult = await doExtrasFetch(url, {
            method: 'POST',
            body: requestData,
        });

        if (!apiResult.ok) {
            toastr.error(apiResult.statusText, 'STT Generation Failed (Whisper)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
            throw new Error(`HTTP ${apiResult.status}: ${await apiResult.text()}`);
        }

        const result = await apiResult.json();
        return result.transcript;
    }
}
|
Loading…
Reference in New Issue