From 192c82b1800cf12b2b2e8c875102c22f1940adb7 Mon Sep 17 00:00:00 2001 From: Tony Ribeiro Date: Mon, 31 Jul 2023 18:47:33 +0200 Subject: [PATCH] Restored streaming mode as a new provider "Streaming", recording is done on server side, voice detection with vosk and transcript with whisper. --- .../extensions/speech-recognition/index.js | 108 ++++++++++++++++-- .../speech-recognition/streaming.js | 102 +++++++++++++++++ 2 files changed, 200 insertions(+), 10 deletions(-) create mode 100644 public/scripts/extensions/speech-recognition/streaming.js diff --git a/public/scripts/extensions/speech-recognition/index.js b/public/scripts/extensions/speech-recognition/index.js index e32a9a3db..e5b0ae116 100644 --- a/public/scripts/extensions/speech-recognition/index.js +++ b/public/scripts/extensions/speech-recognition/index.js @@ -8,16 +8,21 @@ import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper import { VoskSttProvider } from './vosk.js' import { WhisperSttProvider } from './whisper.js' import { BrowserSttProvider } from './browser.js' +import { StreamingSttProvider } from './streaming.js' export { MODULE_NAME }; const MODULE_NAME = 'Speech Recognition'; const DEBUG_PREFIX = " " +const UPDATE_INTERVAL = 100; + +let inApiCall = false; let sttProviders = { None: null, Browser: BrowserSttProvider, Whisper: WhisperSttProvider, Vosk: VoskSttProvider, + Streaming: StreamingSttProvider, } let sttProvider = null @@ -27,6 +32,82 @@ let audioRecording = false const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } }; let audioChunks = []; +async function moduleWorker() { + if (sttProviderName != "Streaming") { + return; + } + + // API is busy + if (inApiCall) { + return; + } + + try { + inApiCall = true; + const userMessageOriginal = await sttProvider.getUserMessage(); + let userMessageFormatted = userMessageOriginal.trim(); + + if (userMessageFormatted.length > 0) + { + console.debug(DEBUG_PREFIX+"recorded transcript: \""+userMessageFormatted+"\""); + + let userMessageLower = userMessageFormatted.toLowerCase(); + // remove punctuation + let userMessageRaw = userMessageLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " "); + + console.debug(DEBUG_PREFIX+"raw transcript:",userMessageRaw); + + // Detect trigger words + let messageStart = -1; + + if (extension_settings.speech_recognition.Streaming.triggerWordsEnabled) { + + for (const triggerWord of extension_settings.speech_recognition.Streaming.triggerWords) { + const triggerPos = userMessageRaw.indexOf(triggerWord.toLowerCase()); + + // Trigger word not found or not starting message and just a substring + if (triggerPos == -1){ // | (triggerPos > 0 & userMessageFormatted[triggerPos-1] != " ")) { + console.debug(DEBUG_PREFIX+"trigger word not found: ", triggerWord); + } + else { + console.debug(DEBUG_PREFIX+"Found trigger word: ", triggerWord, " at index ", triggerPos); + if (triggerPos < messageStart | messageStart == -1) { // & (triggerPos + triggerWord.length) < userMessageFormatted.length)) { + messageStart = triggerPos; // + triggerWord.length + 1; + } + } + } + } else { + messageStart = 0; + } + + if (messageStart == -1) { + console.debug(DEBUG_PREFIX+"message ignored, no trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\""); + if (extension_settings.speech_recognition.Streaming.debug) { + toastr.info( + "No trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"", + DEBUG_PREFIX+"message ignored.", + { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true }, + ); + } + } + else{ + userMessageFormatted = userMessageFormatted.substring(messageStart); + processTranscript(userMessageFormatted); + } + } + else + { + console.debug(DEBUG_PREFIX+"Received empty transcript, ignored"); + } + } + catch (error) { + console.debug(error); + } + finally { + inApiCall = false; + } +} + async function processTranscript(transcript) { try { const transcriptOriginal = transcript; @@ -198,13 +279,21 @@ function loadSttProvider(provider) { if (sttProviderName == "Browser") { sttProvider.processTranscriptFunction = processTranscript; sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); - } - else { - sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); - loadNavigatorAudioRecording(); - $("#microphone_button").show(); } + + if (sttProviderName == "Vosk" | sttProviderName == "Whisper") { + sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); + loadNavigatorAudioRecording(); + $("#microphone_button").show(); + } + + if (sttProviderName == "Streaming") { + sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); + $("#microphone_button").off('click'); + $("#microphone_button").hide(); + } + } function onSttProviderChange() { @@ -231,7 +320,7 @@ const defaultSettings = { messageMode: "append", messageMappingText: "", messageMapping: [], - messageMappingEnabled: false + messageMappingEnabled: false, } function loadSettings() { @@ -344,8 +433,7 @@ $(document).ready(function () { addExtensionControls(); // No init dependencies loadSettings(); // Depends on Extension Controls and loadTtsProvider loadSttProvider(extension_settings.speech_recognition.currentProvider); // No dependencies - - //const wrapper = new ModuleWorkerWrapper(moduleWorker); - //setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things - //moduleWorker(); + const wrapper = new ModuleWorkerWrapper(moduleWorker); + setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things + moduleWorker(); }) diff --git a/public/scripts/extensions/speech-recognition/streaming.js b/public/scripts/extensions/speech-recognition/streaming.js new file mode 100644 index 000000000..0ef41235c --- /dev/null +++ b/public/scripts/extensions/speech-recognition/streaming.js @@ -0,0 +1,102 @@ +import { getApiUrl, doExtrasFetch, modules } from "../../extensions.js"; +export { StreamingSttProvider } + +const DEBUG_PREFIX = " " + +class StreamingSttProvider { + //########// + // Config // + //########// + + settings + + defaultSettings = { + triggerWordsText: "", + triggerWords : [], + triggerWordsEnabled : false, + debug : false, + } + + get settingsHtml() { + let html = '\ +
\ + Trigger words\ + \ + \ + \ +
\ + ' + return html + } + + onSettingsChange() { + this.settings.triggerWordsText = $('#speech_recognition_streaming_trigger_words').val(); + let array = $('#speech_recognition_streaming_trigger_words').val().split(","); + array = array.map(element => {return element.trim().toLowerCase();}); + array = array.filter((str) => str !== ''); + this.settings.triggerWords = array; + this.settings.triggerWordsEnabled = $("#speech_recognition_streaming_trigger_words_enabled").is(':checked'); + this.settings.debug = $("#speech_recognition_streaming_debug").is(':checked'); + console.debug(DEBUG_PREFIX+" Updated settings: ", this.settings); + this.loadSettings(this.settings); + } + + loadSettings(settings) { + // Populate Provider UI given input settings + if (Object.keys(settings).length == 0) { + console.debug(DEBUG_PREFIX+"Using default Whisper STT extension settings") + } + + // Only accept keys defined in defaultSettings + this.settings = this.defaultSettings + + for (const key in settings){ + if (key in this.settings){ + this.settings[key] = settings[key] + } else { + throw `Invalid setting passed to STT extension: ${key}` + } + } + + $("#speech_recognition_streaming_trigger_words").val(this.settings.triggerWordsText); + $("#speech_recognition_streaming_trigger_words_enabled").prop('checked',this.settings.triggerWordsEnabled); + $("#speech_recognition_streaming_debug").prop('checked',this.settings.debug); + + console.debug(DEBUG_PREFIX+"streaming STT settings loaded") + } + + async getUserMessage() { + // Return if module is not loaded + if (!modules.includes('streaming-stt')) { + console.debug(DEBUG_PREFIX+"Module streaming-stt must be activated in Sillytavern Extras for streaming user voice.") + return ""; + } + + const url = new URL(getApiUrl()); + url.pathname = '/api/speech-recognition/streaming/record-and-transcript'; + + const apiResult = await doExtrasFetch(url, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Bypass-Tunnel-Reminder': 'bypass', + }, + body: JSON.stringify({ text: "" }), + }); + + if (!apiResult.ok) { + toastr.error(apiResult.statusText, DEBUG_PREFIX+'STT Generation Failed (streaming)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true }); + throw new Error(`HTTP ${apiResult.status}: ${await apiResult.text()}`); + } + + const data = await apiResult.json(); + return data.transcript; + } + +} \ No newline at end of file