mirror of
				https://github.com/SillyTavern/SillyTavern.git
				synced 2025-06-05 21:59:27 +02:00 
			
		
		
		
Restored streaming mode as a new provider, "Streaming": recording is done on the server side, voice detection is handled by Vosk, and transcription by Whisper.
This commit is contained in:
		| @@ -8,16 +8,21 @@ import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper | ||||
| import { VoskSttProvider } from './vosk.js' | ||||
| import { WhisperSttProvider } from './whisper.js' | ||||
| import { BrowserSttProvider } from './browser.js' | ||||
| import { StreamingSttProvider } from './streaming.js' | ||||
| export { MODULE_NAME }; | ||||
|  | ||||
| const MODULE_NAME = 'Speech Recognition'; | ||||
| const DEBUG_PREFIX = "<Speech Recognition module> " | ||||
| const UPDATE_INTERVAL = 100; | ||||
|  | ||||
| let inApiCall = false; | ||||
|  | ||||
| let sttProviders = { | ||||
|     None: null, | ||||
|     Browser: BrowserSttProvider, | ||||
|     Whisper: WhisperSttProvider, | ||||
|     Vosk: VoskSttProvider, | ||||
|     Streaming: StreamingSttProvider, | ||||
| } | ||||
|  | ||||
| let sttProvider = null | ||||
| @@ -27,6 +32,82 @@ let audioRecording = false | ||||
| const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } }; | ||||
| let audioChunks = []; | ||||
|  | ||||
/**
 * Polled module worker (scheduled every UPDATE_INTERVAL ms).
 * Only active for the "Streaming" provider: pulls the latest server-side
 * transcript, optionally gates it on the configured trigger words, and
 * forwards accepted text to processTranscript(). Re-entrancy is prevented
 * with the module-level inApiCall flag.
 */
async function moduleWorker() {
    // Only the Streaming provider is polled; other providers push transcripts themselves.
    if (sttProviderName !== "Streaming") {
        return;
    }

    // API is busy with a previous request
    if (inApiCall) {
        return;
    }

    try {
        inApiCall = true;
        const userMessageOriginal = await sttProvider.getUserMessage();
        let userMessageFormatted = userMessageOriginal.trim();

        if (userMessageFormatted.length > 0) {
            console.debug(DEBUG_PREFIX + "recorded transcript: \"" + userMessageFormatted + "\"");

            const userMessageLower = userMessageFormatted.toLowerCase();
            // Strip punctuation and collapse whitespace so trigger words match reliably
            const userMessageRaw = userMessageLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");

            console.debug(DEBUG_PREFIX + "raw transcript:", userMessageRaw);

            // Index where the accepted message starts; -1 means "no trigger word found"
            let messageStart = -1;

            if (extension_settings.speech_recognition.Streaming.triggerWordsEnabled) {
                // Keep the earliest occurrence among all configured trigger words
                for (const triggerWord of extension_settings.speech_recognition.Streaming.triggerWords) {
                    const triggerPos = userMessageRaw.indexOf(triggerWord.toLowerCase());

                    if (triggerPos === -1) {
                        console.debug(DEBUG_PREFIX + "trigger word not found: ", triggerWord);
                    }
                    else {
                        console.debug(DEBUG_PREFIX + "Found trigger word: ", triggerWord, " at index ", triggerPos);
                        // Logical OR here; the original used bitwise "|", which only
                        // worked by accident on these small integer operands.
                        if (messageStart === -1 || triggerPos < messageStart) {
                            messageStart = triggerPos;
                        }
                    }
                }
            } else {
                // Trigger words disabled: the whole transcript is the message
                messageStart = 0;
            }

            if (messageStart === -1) {
                console.debug(DEBUG_PREFIX + "message ignored, no trigger word preceding a message. Voice transcript: \"" + userMessageOriginal + "\"");
                if (extension_settings.speech_recognition.Streaming.debug) {
                    toastr.info(
                        "No trigger word preceding a message. Voice transcript: \"" + userMessageOriginal + "\"",
                        DEBUG_PREFIX + "message ignored.",
                        { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true },
                    );
                }
            }
            else {
                // NOTE(review): messageStart is an index into userMessageRaw (punctuation
                // stripped) but is applied to userMessageFormatted; the offsets can diverge
                // when punctuation precedes the trigger word — confirm intended behavior.
                userMessageFormatted = userMessageFormatted.substring(messageStart);
                processTranscript(userMessageFormatted);
            }
        }
        else {
            console.debug(DEBUG_PREFIX + "Received empty transcript, ignored");
        }
    }
    catch (error) {
        // Best-effort polling loop: log and keep the worker alive for the next tick
        console.debug(error);
    }
    finally {
        inApiCall = false;
    }
}
|  | ||||
| async function processTranscript(transcript) { | ||||
|     try { | ||||
|         const transcriptOriginal =  transcript; | ||||
| @@ -198,13 +279,21 @@ function loadSttProvider(provider) { | ||||
|     if (sttProviderName == "Browser") { | ||||
|         sttProvider.processTranscriptFunction = processTranscript; | ||||
|         sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); | ||||
|     } | ||||
|     else { | ||||
|         sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); | ||||
|         loadNavigatorAudioRecording(); | ||||
|  | ||||
|         $("#microphone_button").show(); | ||||
|     } | ||||
|  | ||||
|     if (sttProviderName == "Vosk" | sttProviderName == "Whisper") { | ||||
|         sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); | ||||
|         loadNavigatorAudioRecording(); | ||||
|         $("#microphone_button").show(); | ||||
|     } | ||||
|      | ||||
|     if (sttProviderName == "Streaming") { | ||||
|         sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]); | ||||
|         $("#microphone_button").off('click'); | ||||
|         $("#microphone_button").hide(); | ||||
|     } | ||||
|      | ||||
| } | ||||
|  | ||||
| function onSttProviderChange() { | ||||
| @@ -231,7 +320,7 @@ const defaultSettings = { | ||||
|     messageMode: "append", | ||||
|     messageMappingText: "", | ||||
|     messageMapping: [], | ||||
|     messageMappingEnabled: false | ||||
|     messageMappingEnabled: false, | ||||
| } | ||||
|  | ||||
| function loadSettings() { | ||||
| @@ -344,8 +433,7 @@ $(document).ready(function () { | ||||
|     addExtensionControls(); // No init dependencies | ||||
|     loadSettings(); // Depends on Extension Controls and loadTtsProvider | ||||
|     loadSttProvider(extension_settings.speech_recognition.currentProvider); // No dependencies | ||||
|  | ||||
|     //const wrapper = new ModuleWorkerWrapper(moduleWorker); | ||||
|     //setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things | ||||
|     //moduleWorker(); | ||||
|     const wrapper = new ModuleWorkerWrapper(moduleWorker); | ||||
|     setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things | ||||
|     moduleWorker(); | ||||
| }) | ||||
|   | ||||
							
								
								
									
										102
									
								
								public/scripts/extensions/speech-recognition/streaming.js
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										102
									
								
								public/scripts/extensions/speech-recognition/streaming.js
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,102 @@ | ||||
| import { getApiUrl, doExtrasFetch, modules } from "../../extensions.js"; | ||||
| export { StreamingSttProvider } | ||||
|  | ||||
| const DEBUG_PREFIX = "<Speech Recognition module (streaming)> " | ||||
|  | ||||
/**
 * Speech-to-text provider that delegates recording and transcription to the
 * SillyTavern Extras server ("streaming-stt" module). The client only polls
 * for finished transcripts via getUserMessage().
 */
class StreamingSttProvider {
    //########//
    // Config //
    //########//

    // Active provider settings: a validated copy of defaultSettings merged with user values
    settings

    defaultSettings = {
        triggerWordsText: "",
        triggerWords : [],
        triggerWordsEnabled : false,
        debug : false,
    }

    // Settings panel markup injected into the extension UI
    get settingsHtml() {
        let html = '\
        <div id="speech_recognition_streaming_trigger_words_div">\
        <span>Trigger words</span>\
        <textarea id="speech_recognition_streaming_trigger_words" class="text_pole textarea_compact" type="text" rows="4" placeholder="Enter comma separated words that triggers new message, example:\nhey, hey aqua, record, listen"></textarea>\
        <label class="checkbox_label" for="speech_recognition_streaming_trigger_words_enabled">\
            <input type="checkbox" id="speech_recognition_streaming_trigger_words_enabled" name="speech_recognition_trigger_words_enabled">\
            <small>Enable trigger words</small>\
        </label>\
        <label class="checkbox_label" for="speech_recognition_streaming_debug">\
            <input type="checkbox" id="speech_recognition_streaming_debug" name="speech_recognition_streaming_debug">\
            <small>Enable debug pop ups</small>\
        </label>\
        </div>\
        '
        return html
    }

    // Read the UI controls back into this.settings (trigger words are
    // normalized to trimmed, lowercase, non-empty entries), then re-apply.
    onSettingsChange() {
        this.settings.triggerWordsText = $('#speech_recognition_streaming_trigger_words').val();
        let array = $('#speech_recognition_streaming_trigger_words').val().split(",");
        array = array.map(element => {return element.trim().toLowerCase();});
        array = array.filter((str) => str !== '');
        this.settings.triggerWords = array;
        this.settings.triggerWordsEnabled = $("#speech_recognition_streaming_trigger_words_enabled").is(':checked');
        this.settings.debug = $("#speech_recognition_streaming_debug").is(':checked');
        console.debug(DEBUG_PREFIX+" Updated settings: ", this.settings);
        this.loadSettings(this.settings);
    }

    /**
     * Validate and apply persisted settings, then populate the provider UI.
     * Throws on any key not declared in defaultSettings.
     */
    loadSettings(settings) {
        // Populate Provider UI given input settings
        if (Object.keys(settings).length === 0) {
            // Fixed message: this is the Streaming provider, not Whisper
            console.debug(DEBUG_PREFIX+"Using default Streaming STT extension settings")
        }

        // Only accept keys defined in defaultSettings.
        // Copy the defaults instead of aliasing them: the original assigned
        // by reference, so later mutations polluted defaultSettings.
        this.settings = Object.assign({}, this.defaultSettings)

        for (const key in settings){
            if (key in this.settings){
                this.settings[key] = settings[key]
            } else {
                // Throw a real Error (the original threw a bare string)
                throw new Error(`Invalid setting passed to STT extension: ${key}`)
            }
        }

        $("#speech_recognition_streaming_trigger_words").val(this.settings.triggerWordsText);
        $("#speech_recognition_streaming_trigger_words_enabled").prop('checked',this.settings.triggerWordsEnabled);
        $("#speech_recognition_streaming_debug").prop('checked',this.settings.debug);

        console.debug(DEBUG_PREFIX+"streaming STT settings loaded")
    }

    /**
     * Ask the Extras server to record and transcribe one utterance.
     * Returns the transcript string, or "" when the streaming-stt module
     * is not loaded on the Extras side. Throws on HTTP failure.
     */
    async getUserMessage() {
        // Return if module is not loaded
        if (!modules.includes('streaming-stt')) {
            console.debug(DEBUG_PREFIX+"Module streaming-stt must be activated in Sillytavern Extras for streaming user voice.")
            return "";
        }

        const url = new URL(getApiUrl());
        url.pathname = '/api/speech-recognition/streaming/record-and-transcript';

        const apiResult = await doExtrasFetch(url, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Bypass-Tunnel-Reminder': 'bypass',
            },
            body: JSON.stringify({ text: "" }),
        });

        if (!apiResult.ok) {
            toastr.error(apiResult.statusText, DEBUG_PREFIX+'STT Generation Failed  (streaming)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
            throw new Error(`HTTP ${apiResult.status}: ${await apiResult.text()}`);
        }

        const data = await apiResult.json();
        return data.transcript;
    }

}
		Reference in New Issue
	
	Block a user