From 2997522c5251328e94b255a91ebb1b93a8755092 Mon Sep 17 00:00:00 2001
From: Tony Ribeiro
Date: Fri, 13 Oct 2023 01:34:00 +0200
Subject: [PATCH] Removed speech recognition extension (now a third-party
 extension, downloaded from the assets menu).

---
 .../extensions/speech-recognition/browser.js  | 233 ---------
 .../extensions/speech-recognition/index.js    | 452 ------------------
 .../speech-recognition/manifest.json          |  14 -
 .../speech-recognition/streaming.js           | 109 -----
 .../extensions/speech-recognition/style.css   |   3 -
 .../extensions/speech-recognition/vosk.js     |  65 ---
 .../extensions/speech-recognition/whisper.js  |  67 ---
 7 files changed, 943 deletions(-)
 delete mode 100644 public/scripts/extensions/speech-recognition/browser.js
 delete mode 100644 public/scripts/extensions/speech-recognition/index.js
 delete mode 100644 public/scripts/extensions/speech-recognition/manifest.json
 delete mode 100644 public/scripts/extensions/speech-recognition/streaming.js
 delete mode 100644 public/scripts/extensions/speech-recognition/style.css
 delete mode 100644 public/scripts/extensions/speech-recognition/vosk.js
 delete mode 100644 public/scripts/extensions/speech-recognition/whisper.js

diff --git a/public/scripts/extensions/speech-recognition/browser.js b/public/scripts/extensions/speech-recognition/browser.js
deleted file mode 100644
index f51019894..000000000
--- a/public/scripts/extensions/speech-recognition/browser.js
+++ /dev/null
@@ -1,233 +0,0 @@
-// Borrowed from Agnai (AGPLv3)
-// https://github.com/agnaistic/agnai/blob/dev/web/pages/Chat/components/SpeechRecognitionRecorder.tsx
-// First version by Cohee#1207
-// Adapted by Tony-sama
-
-export { BrowserSttProvider }
-
-const DEBUG_PREFIX = "<Speech Recognition module> "
-
-class BrowserSttProvider {
-    //########//
-    // Config //
-    //########//
-
-    settings = {
-        language: ""
-    }
-
-    defaultSettings = {
-        language: "en-US",
-    }
-
-    processTranscriptFunction = null;
-
-    get settingsHtml() {
-        let html = ' \
-        Language <br> \
-        <select id="speech_recognition_browser_provider_language"> \
-        </select> \
-        '
-        return html
-    }
-
-    onSettingsChange() {
-        // Used when provider settings are updated from UI
-        this.settings.language = $("#speech_recognition_browser_provider_language").val();
-        console.debug(DEBUG_PREFIX+"Change language to",this.settings.language);
-        this.loadSettings(this.settings);
-    }
-
-    static capitalizeInterim(interimTranscript) {
-        let capitalizeIndex = -1;
-        if (interimTranscript.length > 2 && interimTranscript[0] === ' ') capitalizeIndex = 1;
-        else if (interimTranscript.length > 1) capitalizeIndex = 0;
-        if (capitalizeIndex > -1) {
-            const spacing = capitalizeIndex > 0 ? ' '.repeat(capitalizeIndex - 1) : '';
-            const capitalized = interimTranscript[capitalizeIndex].toLocaleUpperCase();
-            const rest = interimTranscript.substring(capitalizeIndex + 1);
-            interimTranscript = spacing + capitalized + rest;
-        }
-        return interimTranscript;
-    }
-
-    static composeValues(previous, interim) {
-        let spacing = '';
-        if (previous.endsWith('.')) spacing = ' ';
-        return previous + spacing + interim;
-    }
-
-    loadSettings(settings) {
-        const processTranscript = this.processTranscriptFunction;
-
-        // Populate Provider UI given input settings
-        if (Object.keys(settings).length == 0) {
-            console.debug(DEBUG_PREFIX+"Using default browser STT settings")
-        }
-
-        // Initialise as defaultSettings
-        this.settings = this.defaultSettings;
-
-        for (const key in settings){
-            if (key in this.settings){
-                this.settings[key] = settings[key]
-            } else {
-                throw `Invalid setting passed to Speech recogniton extension (browser): ${key}`
-            }
-        }
-
-        $("#speech_recognition_browser_provider_language").val(this.settings.language);
-
-        const speechRecognitionSettings = $.extend({
-            grammar: '' // Custom grammar
-        }, options);
-
-        const speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
-        const speechRecognitionList = window.SpeechGrammarList || window.webkitSpeechGrammarList;
-
-        if (!speechRecognition) {
-            console.warn(DEBUG_PREFIX+'Speech recognition is not supported in this browser.');
-            $("#microphone_button").hide();
-            toastr.error("Speech recognition is not supported in this browser, use another browser or another provider of SillyTavern-extras Speech recognition extension.", "Speech recognition activation Failed (Browser)", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
-            return;
-        }
-
-        const recognition = new speechRecognition();
-
-        if (speechRecognitionSettings.grammar && speechRecognitionList) {
-            speechRecognitionList.addFromString(speechRecognitionSettings.grammar, 1);
-            recognition.grammars = speechRecognitionList;
-        }
-
-        recognition.continuous = true;
-        recognition.interimResults = true;
-        recognition.lang = this.settings.language;
-
-        const textarea = $('#send_textarea');
-        const button = $('#microphone_button');
-
-        let listening = false;
-        button.off('click').on("click", function () {
-            if (listening) {
-                recognition.stop();
-            } else {
-                recognition.start();
-            }
-            listening = !listening;
-        });
-
-        let initialText = '';
-
-        recognition.onresult = function (speechEvent) {
-            let finalTranscript = '';
-            let interimTranscript = ''
-
-            for (let i = speechEvent.resultIndex; i < speechEvent.results.length; ++i) {
-                const transcript = speechEvent.results[i][0].transcript;
-
-                if (speechEvent.results[i].isFinal) {
-                    let interim = BrowserSttProvider.capitalizeInterim(transcript);
-                    if (interim != '') {
-                        let final = finalTranscript;
-                        final = BrowserSttProvider.composeValues(final, interim);
-                        if (final.slice(-1) != '.' && final.slice(-1) != '?') final += '.';
-                        finalTranscript = final;
-                        recognition.abort();
-                        listening = false;
-                    }
-                    interimTranscript = ' ';
-                } else {
-                    interimTranscript += transcript;
-                }
-            }
-
-            interimTranscript = BrowserSttProvider.capitalizeInterim(interimTranscript);
-
-            textarea.val(initialText + finalTranscript + interimTranscript);
-        };
-
-        recognition.onerror = function (event) {
-            console.error('Error occurred in recognition:', event.error);
-            //if ($('#speech_recognition_debug').is(':checked'))
-            //    toastr.error('Error occurred in recognition:'+ event.error, 'STT Generation error (Browser)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
-        };
-
-        recognition.onend = function () {
-            listening = false;
-            button.toggleClass('fa-microphone fa-microphone-slash');
-            const newText = textarea.val().substring(initialText.length);
-            textarea.val(textarea.val().substring(0,initialText.length));
-            processTranscript(newText);
-
-        };
-
-        recognition.onstart = function () {
-            initialText = textarea.val();
-            button.toggleClass('fa-microphone fa-microphone-slash');
-
-            if ($("#speech_recognition_message_mode").val() == "replace") {
-                textarea.val("");
-                initialText = ""
-            }
-        };
-
-        $("#microphone_button").show();
-
-        console.debug(DEBUG_PREFIX+"Browser STT settings loaded")
-    }
-
-
-}
diff --git a/public/scripts/extensions/speech-recognition/index.js b/public/scripts/extensions/speech-recognition/index.js
deleted file mode 100644
index 3ac3df229..000000000
--- a/public/scripts/extensions/speech-recognition/index.js
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
-TODO:
- - try pseudo streaming audio by just sending chunk every X seconds and asking VOSK if it is full text.
-*/
-
-import { saveSettingsDebounced } from "../../../script.js";
-import { getContext, extension_settings, ModuleWorkerWrapper } from "../../extensions.js";
-import { VoskSttProvider } from './vosk.js'
-import { WhisperSttProvider } from './whisper.js'
-import { BrowserSttProvider } from './browser.js'
-import { StreamingSttProvider } from './streaming.js'
-import { getMessageTimeStamp } from "../../RossAscends-mods.js";
-export { MODULE_NAME };
-
-const MODULE_NAME = 'Speech Recognition';
-const DEBUG_PREFIX = "<Speech Recognition module> "
-const UPDATE_INTERVAL = 100;
-
-let inApiCall = false;
-
-let sttProviders = {
-    None: null,
-    Browser: BrowserSttProvider,
-    Whisper: WhisperSttProvider,
-    Vosk: VoskSttProvider,
-    Streaming: StreamingSttProvider,
-}
-
-let sttProvider = null
-let sttProviderName = "None"
-
-let audioRecording = false
-const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } };
-let audioChunks = [];
-
-async function moduleWorker() {
-    if (sttProviderName != "Streaming") {
-        return;
-    }
-
-    // API is busy
-    if (inApiCall) {
-        return;
-    }
-
-    try {
-        inApiCall = true;
-        const userMessageOriginal = await sttProvider.getUserMessage();
-        let userMessageFormatted = userMessageOriginal.trim();
-
-        if (userMessageFormatted.length > 0)
-        {
-            console.debug(DEBUG_PREFIX+"recorded transcript: \""+userMessageFormatted+"\"");
-
-            let userMessageLower = userMessageFormatted.toLowerCase();
-            // remove punctuation
-            let userMessageRaw = userMessageLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");
-
-            console.debug(DEBUG_PREFIX+"raw transcript:",userMessageRaw);
-
-            // Detect trigger words
-            let messageStart = -1;
-
-            if (extension_settings.speech_recognition.Streaming.triggerWordsEnabled) {
-
-                for (const triggerWord of extension_settings.speech_recognition.Streaming.triggerWords) {
-                    const triggerPos = userMessageRaw.indexOf(triggerWord.toLowerCase());
-
-                    // Trigger word not found or not starting message and just a substring
-                    if (triggerPos == -1){ // | (triggerPos > 0 & userMessageFormatted[triggerPos-1] != " ")) {
-                        console.debug(DEBUG_PREFIX+"trigger word not found: ", triggerWord);
-                    }
-                    else {
-                        console.debug(DEBUG_PREFIX+"Found trigger word: ", triggerWord, " at index ", triggerPos);
-                        if (triggerPos < messageStart || messageStart == -1) { // & (triggerPos + triggerWord.length) < userMessageFormatted.length)) {
-                            messageStart = triggerPos; // + triggerWord.length + 1;
-
-                            if (!extension_settings.speech_recognition.Streaming.triggerWordsIncluded)
-                                messageStart = triggerPos + triggerWord.length + 1;
-                        }
-                    }
-                }
-            } else {
-                messageStart = 0;
-            }
-
-            if (messageStart == -1) {
-                console.debug(DEBUG_PREFIX+"message ignored, no trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"");
-                if (extension_settings.speech_recognition.Streaming.debug) {
-                    toastr.info(
-                        "No trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"",
-                        DEBUG_PREFIX+"message ignored.",
-                        { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true },
-                    );
-                }
-            }
-            else{
-                userMessageFormatted = userMessageFormatted.substring(messageStart);
-                // Trim non alphanumeric character from the start
-                messageStart = 0;
-                for(const i of userMessageFormatted) {
-                    if(/^[a-z]$/i.test(i)) {
-                        break;
-                    }
-                    messageStart += 1;
-                }
-                userMessageFormatted = userMessageFormatted.substring(messageStart);
-                userMessageFormatted = userMessageFormatted.charAt(0).toUpperCase() + userMessageFormatted.substring(1);
-                processTranscript(userMessageFormatted);
-            }
-        }
-        else
-        {
-            console.debug(DEBUG_PREFIX+"Received empty transcript, ignored");
-        }
-    }
-    catch (error) {
-        console.debug(error);
-    }
-    finally {
-        inApiCall = false;
-    }
-}
-
-async function processTranscript(transcript) {
-    try {
-        const transcriptOriginal = transcript;
-        let transcriptFormatted = transcriptOriginal.trim();
-
-        if (transcriptFormatted.length > 0)
-        {
-            console.debug(DEBUG_PREFIX+"recorded transcript: \""+transcriptFormatted+"\"");
-            const messageMode = extension_settings.speech_recognition.messageMode;
-            console.debug(DEBUG_PREFIX+"mode: "+messageMode);
-
-            let transcriptLower = transcriptFormatted.toLowerCase()
-            // remove punctuation
-            let transcriptRaw = transcriptLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");
-
-            // Check message mapping
-            if (extension_settings.speech_recognition.messageMappingEnabled) {
-                console.debug(DEBUG_PREFIX+"Start searching message mapping into:",transcriptRaw)
-                for (const key in extension_settings.speech_recognition.messageMapping) {
-                    console.debug(DEBUG_PREFIX+"message mapping searching: ", key,"=>",extension_settings.speech_recognition.messageMapping[key]);
-                    if (transcriptRaw.includes(key)) {
-                        var message = extension_settings.speech_recognition.messageMapping[key];
-                        console.debug(DEBUG_PREFIX+"message mapping found: ", key,"=>",extension_settings.speech_recognition.messageMapping[key]);
-                        $("#send_textarea").val(message);
-
-                        if (messageMode == "auto_send") await getContext().generate();
-                        return;
-                    }
-                }
-            }
-
-            console.debug(DEBUG_PREFIX+"no message mapping found, processing transcript as normal message");
-
-            switch (messageMode) {
-                case "auto_send":
-                    $('#send_textarea').val("") // clear message area to avoid double message
-
-                    console.debug(DEBUG_PREFIX+"Sending message")
-                    const context = getContext();
-                    const messageText = transcriptFormatted;
-                    const message = {
-                        name: context.name1,
-                        is_user: true,
-                        send_date: getMessageTimeStamp(),
-                        mes: messageText,
-                    };
-                    context.chat.push(message);
-                    context.addOneMessage(message);
-
-                    await context.generate();
-
-                    $('#debug_output').text(": message sent: \""+ transcriptFormatted +"\"");
-                    break;
-
-                case "replace":
-                    console.debug(DEBUG_PREFIX+"Replacing message")
-                    $('#send_textarea').val(transcriptFormatted);
-                    break;
-
-                case "append":
-                    console.debug(DEBUG_PREFIX+"Appending message")
-                    $('#send_textarea').val($('#send_textarea').val()+" "+transcriptFormatted);
-                    break;
-
-                default:
-                    console.debug(DEBUG_PREFIX+"Not supported stt message mode: "+messageMode)
-
-            }
-        }
-        else
-        {
-            console.debug(DEBUG_PREFIX+"Empty transcript, do nothing");
-        }
-    }
-    catch (error) {
-        console.debug(error);
-    }
-}
-
-function loadNavigatorAudioRecording() {
-    if (navigator.mediaDevices.getUserMedia) {
-        console.debug(DEBUG_PREFIX+' getUserMedia supported by browser.');
-
-        let onSuccess = function(stream) {
-            const mediaRecorder = new MediaRecorder(stream);
-
-            $("#microphone_button").off('click').on("click", function() {
-                if (!audioRecording) {
-                    mediaRecorder.start();
-                    console.debug(mediaRecorder.state);
-                    console.debug("recorder started");
-                    audioRecording = true;
-                    $("#microphone_button").toggleClass('fa-microphone fa-microphone-slash');
-                }
-                else {
-                    mediaRecorder.stop();
-                    console.debug(mediaRecorder.state);
-                    console.debug("recorder stopped");
-                    audioRecording = false;
-                    $("#microphone_button").toggleClass('fa-microphone fa-microphone-slash');
-                }
-            });
-
-            mediaRecorder.onstop = async function() {
-                console.debug(DEBUG_PREFIX+"data available after MediaRecorder.stop() called: ", audioChunks.length, " chunks");
-                const audioBlob = new Blob(audioChunks, { type: "audio/wav; codecs=0" });
-                audioChunks = [];
-
-                const transcript = await sttProvider.processAudio(audioBlob);
-
-                // TODO: lock and release recording while processing?
-                console.debug(DEBUG_PREFIX+"received transcript:", transcript);
-                processTranscript(transcript);
-            }
-
-            mediaRecorder.ondataavailable = function(e) {
-                audioChunks.push(e.data);
-            }
-        }
-
-        let onError = function(err) {
-            console.debug(DEBUG_PREFIX+"The following error occured: " + err);
-        }
-
-        navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
-
-    } else {
-        console.debug(DEBUG_PREFIX+"getUserMedia not supported on your browser!");
-        toastr.error("getUserMedia not supported", DEBUG_PREFIX+"not supported for your browser.", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
-    }
-}
-
-//##############//
-// STT Provider //
-//##############//
-
-function loadSttProvider(provider) {
-    //Clear the current config and add new config
-    $("#speech_recognition_provider_settings").html("");
-
-    // Init provider references
-    extension_settings.speech_recognition.currentProvider = provider;
-    sttProviderName = provider;
-
-    if (!(sttProviderName in extension_settings.speech_recognition)) {
-        console.warn(`Provider ${sttProviderName} not in Extension Settings, initiatilizing provider in settings`);
-        extension_settings.speech_recognition[sttProviderName] = {};
-    }
-
-    $('#speech_recognition_provider').val(sttProviderName);
-
-    if (sttProviderName == "None") {
-        $("#microphone_button").hide();
-        $("#speech_recognition_message_mode_div").hide();
-        $("#speech_recognition_message_mapping_div").hide();
-        return;
-    }
-
-    $("#speech_recognition_message_mode_div").show();
-    $("#speech_recognition_message_mapping_div").show();
-
-    sttProvider = new sttProviders[sttProviderName]
-
-    // Init provider settings
-    $('#speech_recognition_provider_settings').append(sttProvider.settingsHtml);
-
-    // Use microphone button as push to talk
-    if (sttProviderName == "Browser") {
-        sttProvider.processTranscriptFunction = processTranscript;
-        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
-        $("#microphone_button").show();
-    }
-
-    if (sttProviderName == "Vosk" | sttProviderName == "Whisper") {
-        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
-        loadNavigatorAudioRecording();
-        $("#microphone_button").show();
-    }
-
-    if (sttProviderName == "Streaming") {
-        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
-        $("#microphone_button").off('click');
-        $("#microphone_button").hide();
-    }
-
-}
-
-function onSttProviderChange() {
-    const sttProviderSelection = $('#speech_recognition_provider').val();
-    loadSttProvider(sttProviderSelection);
-    saveSettingsDebounced();
-}
-
-function onSttProviderSettingsInput() {
-    sttProvider.onSettingsChange();
-
-    // Persist changes to SillyTavern stt extension settings
-    extension_settings.speech_recognition[sttProviderName] = sttProvider.settings;
-    saveSettingsDebounced();
-    console.info(`Saved settings ${sttProviderName} ${JSON.stringify(sttProvider.settings)}`);
-}
-
-//#############################//
-//  Extension UI and Settings  //
-//#############################//
-
-const defaultSettings = {
-    currentProvider: "None",
-    messageMode: "append",
-    messageMappingText: "",
-    messageMapping: [],
-    messageMappingEnabled: false,
-}
-
-function loadSettings() {
-    if (Object.keys(extension_settings.speech_recognition).length === 0) {
-        Object.assign(extension_settings.speech_recognition, defaultSettings)
-    }
-    $('#speech_recognition_enabled').prop('checked',extension_settings.speech_recognition.enabled);
-    $('#speech_recognition_message_mode').val(extension_settings.speech_recognition.messageMode);
-
-    if (extension_settings.speech_recognition.messageMappingText.length > 0) {
-        $('#speech_recognition_message_mapping').val(extension_settings.speech_recognition.messageMappingText);
-    }
-
-    $('#speech_recognition_message_mapping_enabled').prop('checked',extension_settings.speech_recognition.messageMappingEnabled);
-}
-
-async function onMessageModeChange() {
-    extension_settings.speech_recognition.messageMode = $('#speech_recognition_message_mode').val();
-
-    if(sttProviderName != "Browser" & extension_settings.speech_recognition.messageMode == "auto_send") {
-        $("#speech_recognition_wait_response_div").show()
-    }
-    else {
-        $("#speech_recognition_wait_response_div").hide()
-    }
-
-    saveSettingsDebounced();
-}
-
-async function onMessageMappingChange() {
-    let array = $('#speech_recognition_message_mapping').val().split(",");
-    array = array.map(element => {return element.trim();});
-    array = array.filter((str) => str !== '');
-    extension_settings.speech_recognition.messageMapping = {};
-    for (const text of array) {
-        if (text.includes("=")) {
-            const pair = text.toLowerCase().split("=")
-            extension_settings.speech_recognition.messageMapping[pair[0].trim()] = pair[1].trim()
-            console.debug(DEBUG_PREFIX+"Added mapping", pair[0],"=>", extension_settings.speech_recognition.messageMapping[pair[0]]);
-        }
-        else {
-            console.debug(DEBUG_PREFIX+"Wrong syntax for message mapping, no '=' found in:", text);
-        }
-    }
-
-    $("#speech_recognition_message_mapping_status").text("Message mapping updated to: "+JSON.stringify(extension_settings.speech_recognition.messageMapping))
-    console.debug(DEBUG_PREFIX+"Updated message mapping", extension_settings.speech_recognition.messageMapping);
-    extension_settings.speech_recognition.messageMappingText = $('#speech_recognition_message_mapping').val()
-    saveSettingsDebounced();
-}
-
-async function onMessageMappingEnabledClick() {
-    extension_settings.speech_recognition.messageMappingEnabled = $('#speech_recognition_message_mapping_enabled').is(':checked');
-    saveSettingsDebounced()
-}
-
-$(document).ready(function () {
-    function addExtensionControls() {
-        const settingsHtml = `
-        <div id="speech_recognition_settings">
-            <div class="inline-drawer">
-                <div class="inline-drawer-toggle inline-drawer-header">
-                    <b>Speech Recognition</b>
-                    <div class="inline-drawer-icon fa-solid fa-circle-chevron-down down"></div>
-                </div>
-                <div class="inline-drawer-content">
-                    <div>
-                        <span>Select Speech-to-text Provider</span> <br>
-                        <select id="speech_recognition_provider">
-                        </select>
-                    </div>
-                    <div id="speech_recognition_message_mode_div">
-                        <span>Message Mode</span> <br>
-                        <select id="speech_recognition_message_mode">
-                            <option value="append">Append</option>
-                            <option value="replace">Replace</option>
-                            <option value="auto_send">Auto send</option>
-                        </select>
-                    </div>
-                    <div id="speech_recognition_message_mapping_div">
-                        <span>Message Mapping</span>
-                        <textarea id="speech_recognition_message_mapping" class="text_pole textarea_compact" rows="4"></textarea>
-                        <span id="speech_recognition_message_mapping_status"></span>
-                        <label class="checkbox_label" for="speech_recognition_message_mapping_enabled">
-                            <input type="checkbox" id="speech_recognition_message_mapping_enabled" name="speech_recognition_message_mapping_enabled">
-                            <small>Enable messages mapping</small>
-                        </label>
-                    </div>
-                    <div id="speech_recognition_provider_settings">
-                    </div>
-                </div>
-            </div>
-        </div>
-        `;
-        $('#extensions_settings').append(settingsHtml);
-        $('#speech_recognition_provider_settings').on('input', onSttProviderSettingsInput);
-        for (const provider in sttProviders) {
-            $('#speech_recognition_provider').append($("<option />").val(provider).text(provider));