From 8d794ed03f25de53e1d886a2dbf16a24ace3dbb4 Mon Sep 17 00:00:00 2001 From: Tony Ribeiro Date: Thu, 27 Jul 2023 19:29:36 +0200 Subject: [PATCH] merge new speech-recognition module with staging branch. --- public/scripts/extensions.js | 1 + .../extensions/speech-recognition/browser.js | 233 ++++++++++ .../extensions/speech-recognition/index.js | 433 ++++++++++++++---- .../speech-recognition/manifest.json | 9 +- .../extensions/speech-recognition/vosk.js | 65 +++ .../extensions/speech-recognition/whisper.js | 67 +++ 6 files changed, 709 insertions(+), 99 deletions(-) create mode 100644 public/scripts/extensions/speech-recognition/browser.js create mode 100644 public/scripts/extensions/speech-recognition/vosk.js create mode 100644 public/scripts/extensions/speech-recognition/whisper.js diff --git a/public/scripts/extensions.js b/public/scripts/extensions.js index 4101f5c02..ca0548f32 100644 --- a/public/scripts/extensions.js +++ b/public/scripts/extensions.js @@ -73,6 +73,7 @@ const extension_settings = { fluctuation: 0.1, enabled: false, }, + speech_recognition: {}, }; let modules = []; diff --git a/public/scripts/extensions/speech-recognition/browser.js b/public/scripts/extensions/speech-recognition/browser.js new file mode 100644 index 000000000..f51019894 --- /dev/null +++ b/public/scripts/extensions/speech-recognition/browser.js @@ -0,0 +1,233 @@ +// Borrowed from Agnai (AGPLv3) +// https://github.com/agnaistic/agnai/blob/dev/web/pages/Chat/components/SpeechRecognitionRecorder.tsx +// First version by Cohee#1207 +// Adapted by Tony-sama + +export { BrowserSttProvider } + +const DEBUG_PREFIX = " " + +class BrowserSttProvider { + //########// + // Config // + //########// + + settings = { + language: "" + } + + defaultSettings = { + language: "en-US", + } + + processTranscriptFunction = null; + + get settingsHtml() { + let html = ' \ + Language
\ + \ + ' + return html + } + + onSettingsChange() { + // Used when provider settings are updated from UI + this.settings.language = $("#speech_recognition_browser_provider_language").val(); + console.debug(DEBUG_PREFIX+"Change language to",this.settings.language); + this.loadSettings(this.settings); + } + + static capitalizeInterim(interimTranscript) { + let capitalizeIndex = -1; + if (interimTranscript.length > 2 && interimTranscript[0] === ' ') capitalizeIndex = 1; + else if (interimTranscript.length > 1) capitalizeIndex = 0; + if (capitalizeIndex > -1) { + const spacing = capitalizeIndex > 0 ? ' '.repeat(capitalizeIndex - 1) : ''; + const capitalized = interimTranscript[capitalizeIndex].toLocaleUpperCase(); + const rest = interimTranscript.substring(capitalizeIndex + 1); + interimTranscript = spacing + capitalized + rest; + } + return interimTranscript; + } + + static composeValues(previous, interim) { + let spacing = ''; + if (previous.endsWith('.')) spacing = ' '; + return previous + spacing + interim; + } + + loadSettings(settings) { + const processTranscript = this.processTranscriptFunction; + + // Populate Provider UI given input settings + if (Object.keys(settings).length == 0) { + console.debug(DEBUG_PREFIX+"Using default browser STT settings") + } + + // Initialise as defaultSettings + this.settings = this.defaultSettings; + + for (const key in settings){ + if (key in this.settings){ + this.settings[key] = settings[key] + } else { + throw `Invalid setting passed to Speech recogniton extension (browser): ${key}` + } + } + + $("#speech_recognition_browser_provider_language").val(this.settings.language); + + const speechRecognitionSettings = $.extend({ + grammar: '' // Custom grammar + }, options); + + const speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; + const speechRecognitionList = window.SpeechGrammarList || window.webkitSpeechGrammarList; + + if (!speechRecognition) { + console.warn(DEBUG_PREFIX+'Speech recognition is not supported in this browser.'); + $("#microphone_button").hide(); + toastr.error("Speech recognition is not supported in this browser, use another browser or another provider of SillyTavern-extras Speech recognition extension.", "Speech recognition activation Failed (Browser)", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true }); + return; + } + + const recognition = new speechRecognition(); + + if (speechRecognitionSettings.grammar && speechRecognitionList) { + speechRecognitionList.addFromString(speechRecognitionSettings.grammar, 1); + recognition.grammars = speechRecognitionList; + } + + recognition.continuous = true; + recognition.interimResults = true; + recognition.lang = this.settings.language; + + const textarea = $('#send_textarea'); + const button = $('#microphone_button'); + + let listening = false; + button.off('click').on("click", function () { + if (listening) { + recognition.stop(); + } else { + recognition.start(); + } + listening = !listening; + }); + + let initialText = ''; + + recognition.onresult = function (speechEvent) { + let finalTranscript = ''; + let interimTranscript = '' + + for (let i = speechEvent.resultIndex; i < speechEvent.results.length; ++i) { + const transcript = speechEvent.results[i][0].transcript; + + if (speechEvent.results[i].isFinal) { + let interim = BrowserSttProvider.capitalizeInterim(transcript); + if (interim != '') { + let final = finalTranscript; + final = BrowserSttProvider.composeValues(final, interim); + if (final.slice(-1) != '.' & final.slice(-1) != '?') final += '.'; + finalTranscript = final; + recognition.abort(); + listening = false; + } + interimTranscript = ' '; + } else { + interimTranscript += transcript; + } + } + + interimTranscript = BrowserSttProvider.capitalizeInterim(interimTranscript); + + textarea.val(initialText + finalTranscript + interimTranscript); + }; + + recognition.onerror = function (event) { + console.error('Error occurred in recognition:', event.error); + //if ($('#speech_recognition_debug').is(':checked')) + // toastr.error('Error occurred in recognition:'+ event.error, 'STT Generation error (Browser)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true }); + }; + + recognition.onend = function () { + listening = false; + button.toggleClass('fa-microphone fa-microphone-slash'); + const newText = textarea.val().substring(initialText.length); + textarea.val(textarea.val().substring(0,initialText.length)); + processTranscript(newText); + + }; + + recognition.onstart = function () { + initialText = textarea.val(); + button.toggleClass('fa-microphone fa-microphone-slash'); + + if ($("#speech_recognition_message_mode").val() == "replace") { + textarea.val(""); + initialText = "" + } + }; + + $("#microphone_button").show(); + + console.debug(DEBUG_PREFIX+"Browser STT settings loaded") + } + + +} diff --git a/public/scripts/extensions/speech-recognition/index.js b/public/scripts/extensions/speech-recognition/index.js index b306bf690..e32a9a3db 100644 --- a/public/scripts/extensions/speech-recognition/index.js +++ b/public/scripts/extensions/speech-recognition/index.js @@ -1,110 +1,351 @@ -// Borrowed from Agnai (AGPLv3) -// https://github.com/agnaistic/agnai/blob/dev/web/pages/Chat/components/SpeechRecognitionRecorder.tsx -function capitalizeInterim(interimTranscript) { - let capitalizeIndex = -1; - if (interimTranscript.length > 2 && interimTranscript[0] === ' ') capitalizeIndex = 1; - else if (interimTranscript.length > 1) capitalizeIndex = 0; - if (capitalizeIndex > -1) { - const spacing = capitalizeIndex > 0 ? ' '.repeat(capitalizeIndex - 1) : ''; - const capitalized = interimTranscript[capitalizeIndex].toLocaleUpperCase(); - const rest = interimTranscript.substring(capitalizeIndex + 1); - interimTranscript = spacing + capitalized + rest; +/* +TODO: + - try pseudo streaming audio by just sending chunk every X seconds and asking VOSK if it is full text. +*/ + +import { saveSettingsDebounced } from "../../../script.js"; +import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper, doExtrasFetch } from "../../extensions.js"; +import { VoskSttProvider } from './vosk.js' +import { WhisperSttProvider } from './whisper.js' +import { BrowserSttProvider } from './browser.js' +export { MODULE_NAME }; + +const MODULE_NAME = 'Speech Recognition'; +const DEBUG_PREFIX = " " + +let sttProviders = { + None: null, + Browser: BrowserSttProvider, + Whisper: WhisperSttProvider, + Vosk: VoskSttProvider, +} + +let sttProvider = null +let sttProviderName = "None" + +let audioRecording = false +const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } }; +let audioChunks = []; + +async function processTranscript(transcript) { + try { + const transcriptOriginal = transcript; + let transcriptFormatted = transcriptOriginal.trim(); + + if (transcriptFormatted.length > 0) + { + console.debug(DEBUG_PREFIX+"recorded transcript: \""+transcriptFormatted+"\""); + const messageMode = extension_settings.speech_recognition.messageMode; + console.debug(DEBUG_PREFIX+"mode: "+messageMode); + + let transcriptLower = transcriptFormatted.toLowerCase() + // remove punctuation + let transcriptRaw = transcriptLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " "); + + // Check message mapping + if (extension_settings.speech_recognition.messageMappingEnabled) { + console.debug(DEBUG_PREFIX+"Start searching message mapping into:",transcriptRaw) + for (const key in extension_settings.speech_recognition.messageMapping) { + console.debug(DEBUG_PREFIX+"message mapping searching: ", key,"=>",extension_settings.speech_recognition.messageMapping[key]); + if (transcriptRaw.includes(key)) { + var message = extension_settings.speech_recognition.messageMapping[key]; + console.debug(DEBUG_PREFIX+"message mapping found: ", key,"=>",extension_settings.speech_recognition.messageMapping[key]); + $("#send_textarea").val(message); + + if (messageMode == "auto_send") await getContext().generate(); + return; + } + } + } + + console.debug(DEBUG_PREFIX+"no message mapping found, processing transcript as normal message"); + + switch (messageMode) { + case "auto_send": + $('#send_textarea').val("") // clear message area to avoid double message + + console.debug(DEBUG_PREFIX+"Sending message") + const context = getContext(); + const messageText = transcriptFormatted; + const message = { + name: context.name1, + is_user: true, + is_name: true, + send_date: Date.now(), + mes: messageText, + }; + context.chat.push(message); + context.addOneMessage(message); + + await context.generate(); + + $('#debug_output').text(": message sent: \""+ transcriptFormatted +"\""); + break; + + case "replace": + console.debug(DEBUG_PREFIX+"Replacing message") + $('#send_textarea').val(transcriptFormatted); + break; + + case "append": + console.debug(DEBUG_PREFIX+"Appending message") + $('#send_textarea').val($('#send_textarea').val()+" "+transcriptFormatted); + break; + + default: + console.debug(DEBUG_PREFIX+"Not supported stt message mode: "+messageMode) + + } + } + else + { + console.debug(DEBUG_PREFIX+"Empty transcript, do nothing"); + } + } + catch (error) { + console.debug(error); } - return interimTranscript; } -function composeValues(previous, interim) { - let spacing = ''; - if (previous.endsWith('.')) spacing = ' '; - return previous + spacing + interim; +function loadNavigatorAudioRecording() { + if (navigator.mediaDevices.getUserMedia) { + console.debug(DEBUG_PREFIX+' getUserMedia supported by browser.'); + + let onSuccess = function(stream) { + const mediaRecorder = new MediaRecorder(stream); + + $("#microphone_button").off('click').on("click", function() { + if (!audioRecording) { + mediaRecorder.start(); + console.debug(mediaRecorder.state); + console.debug("recorder started"); + audioRecording = true; + $("#microphone_button").toggleClass('fa-microphone fa-microphone-slash'); + } + else { + mediaRecorder.stop(); + console.debug(mediaRecorder.state); + console.debug("recorder stopped"); + audioRecording = false; + $("#microphone_button").toggleClass('fa-microphone fa-microphone-slash'); + } + }); + + mediaRecorder.onstop = async function() { + console.debug(DEBUG_PREFIX+"data available after MediaRecorder.stop() called: ", audioChunks.length, " chunks"); + const audioBlob = new Blob(audioChunks, { type: "audio/wav; codecs=0" }); + audioChunks = []; + + const transcript = await sttProvider.processAudio(audioBlob); + + // TODO: lock and release recording while processing? + console.debug(DEBUG_PREFIX+"received transcript:", transcript); + processTranscript(transcript); + } + + mediaRecorder.ondataavailable = function(e) { + audioChunks.push(e.data); + } + } + + let onError = function(err) { + console.debug(DEBUG_PREFIX+"The following error occured: " + err); + } + + navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError); + + } else { + console.debug(DEBUG_PREFIX+"getUserMedia not supported on your browser!"); + toastr.error("getUserMedia not supported", DEBUG_PREFIX+"not supported for your browser.", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true }); + } } -(function ($) { - $.fn.speechRecognitionPlugin = function (options) { - const settings = $.extend({ - grammar: '' // Custom grammar - }, options); +//##############// +// STT Provider // +//##############// - const speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; - const speechRecognitionList = window.SpeechGrammarList || window.webkitSpeechGrammarList; +function loadSttProvider(provider) { + //Clear the current config and add new config + $("#speech_recognition_provider_settings").html(""); - if (!speechRecognition) { - console.warn('Speech recognition is not supported in this browser.'); - return; + // Init provider references + extension_settings.speech_recognition.currentProvider = provider; + sttProviderName = provider; + + if (!(sttProviderName in extension_settings.speech_recognition)) { + console.warn(`Provider ${sttProviderName} not in Extension Settings, initiatilizing provider in settings`); + extension_settings.speech_recognition[sttProviderName] = {}; + } + + $('#speech_recognition_provider').val(sttProviderName); + + if (sttProviderName == "None") { + $("#microphone_button").hide(); + $("#speech_recognition_message_mode_div").hide(); + $("#speech_recognition_message_mapping_div").hide(); + return; + } + + $("#speech_recognition_message_mode_div").show(); + $("#speech_recognition_message_mapping_div").show(); + + sttProvider = new sttProviders[sttProviderName] + + // Init provider settings + $('#speech_recognition_provider_settings').append(sttProvider.settingsHtml); + + // Use microphone button as push to talk + if (sttProviderName == "Browser") { + sttProvider.processTranscriptFunction = processTranscript; + sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); + } + else { + sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); + loadNavigatorAudioRecording(); + + $("#microphone_button").show(); + } +} + +function onSttProviderChange() { + const sttProviderSelection = $('#speech_recognition_provider').val(); + loadSttProvider(sttProviderSelection); + saveSettingsDebounced(); +} + +function onSttProviderSettingsInput() { + sttProvider.onSettingsChange(); + + // Persist changes to SillyTavern stt extension settings + extension_settings.speech_recognition[sttProviderName] = sttProvider.settings; + saveSettingsDebounced(); + console.info(`Saved settings ${sttProviderName} ${JSON.stringify(sttProvider.settings)}`); +} + +//#############################// +// Extension UI and Settings // +//#############################// + +const defaultSettings = { + currentProvider: "None", + messageMode: "append", + messageMappingText: "", + messageMapping: [], + messageMappingEnabled: false +} + +function loadSettings() { + if (Object.keys(extension_settings.speech_recognition).length === 0) { + Object.assign(extension_settings.speech_recognition, defaultSettings) + } + $('#speech_recognition_enabled').prop('checked',extension_settings.speech_recognition.enabled); + $('#speech_recognition_message_mode').val(extension_settings.speech_recognition.messageMode); + + if (extension_settings.speech_recognition.messageMappingText.length > 0) { + $('#speech_recognition_message_mapping').val(extension_settings.speech_recognition.messageMappingText); + } + + $('#speech_recognition_message_mapping_enabled').prop('checked',extension_settings.speech_recognition.messageMappingEnabled); +} + +async function onMessageModeChange() { + extension_settings.speech_recognition.messageMode = $('#speech_recognition_message_mode').val(); + + if(sttProviderName != "Browser" & extension_settings.speech_recognition.messageMode == "auto_send") { + $("#speech_recognition_wait_response_div").show() + } + else { + $("#speech_recognition_wait_response_div").hide() + } + + saveSettingsDebounced(); +} + +async function onMessageMappingChange() { + let array = $('#speech_recognition_message_mapping').val().split(","); + array = array.map(element => {return element.trim();}); + array = array.filter((str) => str !== ''); + extension_settings.speech_recognition.messageMapping = {}; + for (const text of array) { + if (text.includes("=")) { + const pair = text.toLowerCase().split("=") + extension_settings.speech_recognition.messageMapping[pair[0].trim()] = pair[1].trim() + console.debug(DEBUG_PREFIX+"Added mapping", pair[0],"=>", extension_settings.speech_recognition.messageMapping[pair[0]]); } - - const recognition = new speechRecognition(); - - if (settings.grammar && speechRecognitionList) { - speechRecognitionList.addFromString(settings.grammar, 1); - recognition.grammars = speechRecognitionList; + else { + console.debug(DEBUG_PREFIX+"Wrong syntax for message mapping, no '=' found in:", text); } + } + + $("#speech_recognition_message_mapping_status").text("Message mapping updated to: "+JSON.stringify(extension_settings.speech_recognition.messageMapping)) + console.debug(DEBUG_PREFIX+"Updated message mapping", extension_settings.speech_recognition.messageMapping); + extension_settings.speech_recognition.messageMappingText = $('#speech_recognition_message_mapping').val() + saveSettingsDebounced(); +} - recognition.continuous = true; - recognition.interimResults = true; - // TODO: This should be configurable. - recognition.lang = 'en-US'; // Set the language to English (US). +async function onMessageMappingEnabledClick() { + extension_settings.speech_recognition.messageMappingEnabled = $('#speech_recognition_message_mapping_enabled').is(':checked'); + saveSettingsDebounced() +} - const $textarea = this; - const $button = $('
'); +$(document).ready(function () { + function addExtensionControls() { + const settingsHtml = ` +
+
+
+ Speech Recognition +
+
+
+
+ Select Speech-to-text Provider
+ +
+
+ Message Mode
+ +
+
+ Message Mapping + + + +
+
+
+
+
+
+ `; + $('#extensions_settings').append(settingsHtml); + $('#speech_recognition_provider_settings').on('input', onSttProviderSettingsInput); + for (const provider in sttProviders) { + $('#speech_recognition_provider').append($("