Restored streaming mode as a new provider, "Streaming": recording is done server-side, with voice detection by Vosk and transcription by Whisper.
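In short: with this provider the client no longer records audio at all. A module worker polls the new "Streaming" provider, whose getUserMessage() POSTs to a SillyTavern Extras endpoint that records on the server and returns the finished transcript, which the client then scans for trigger words. A minimal sketch of that loop, using names from the diff below (the server-side blocking behavior is inferred from the commit message, not shown in this diff):

    // Sketch only: the client-side polling loop this commit introduces.
    setInterval(async () => {
        // Blocks until Extras has recorded a voice segment (vosk) and transcribed it (whisper)
        const transcript = await sttProvider.getUserMessage();
        if (transcript.trim().length > 0) {
            processTranscript(transcript); // in moduleWorker this happens after trigger-word filtering
        }
    }, UPDATE_INTERVAL);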
@@ -8,16 +8,21 @@ import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper
 import { VoskSttProvider } from './vosk.js'
 import { WhisperSttProvider } from './whisper.js'
 import { BrowserSttProvider } from './browser.js'
+import { StreamingSttProvider } from './streaming.js'
 export { MODULE_NAME };

 const MODULE_NAME = 'Speech Recognition';
 const DEBUG_PREFIX = "<Speech Recognition module> "
+const UPDATE_INTERVAL = 100;
+
+let inApiCall = false;

 let sttProviders = {
     None: null,
     Browser: BrowserSttProvider,
     Whisper: WhisperSttProvider,
     Vosk: VoskSttProvider,
+    Streaming: StreamingSttProvider,
 }

 let sttProvider = null
@@ -27,6 +32,82 @@ let audioRecording = false
 const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } };
 let audioChunks = [];

+async function moduleWorker() {
+    if (sttProviderName != "Streaming") {
+        return;
+    }
+
+    // API is busy
+    if (inApiCall) {
+        return;
+    }
+
+    try {
+        inApiCall = true;
+        const userMessageOriginal = await sttProvider.getUserMessage();
+        let userMessageFormatted = userMessageOriginal.trim();
+
+        if (userMessageFormatted.length > 0) {
+            console.debug(DEBUG_PREFIX+"recorded transcript: \""+userMessageFormatted+"\"");
+
+            let userMessageLower = userMessageFormatted.toLowerCase();
+            // remove punctuation
+            let userMessageRaw = userMessageLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");
+
+            console.debug(DEBUG_PREFIX+"raw transcript:", userMessageRaw);
+
+            // Detect trigger words
+            let messageStart = -1;
+
+            if (extension_settings.speech_recognition.Streaming.triggerWordsEnabled) {
+                for (const triggerWord of extension_settings.speech_recognition.Streaming.triggerWords) {
+                    const triggerPos = userMessageRaw.indexOf(triggerWord.toLowerCase());
+
+                    // Trigger word not found in the transcript
+                    if (triggerPos == -1) {
+                        console.debug(DEBUG_PREFIX+"trigger word not found: ", triggerWord);
+                    }
+                    else {
+                        console.debug(DEBUG_PREFIX+"Found trigger word: ", triggerWord, " at index ", triggerPos);
+                        // Keep the earliest trigger word found in the transcript
+                        if (triggerPos < messageStart || messageStart == -1) {
+                            messageStart = triggerPos;
+                        }
+                    }
+                }
+            } else {
+                messageStart = 0;
+            }
+
+            if (messageStart == -1) {
+                console.debug(DEBUG_PREFIX+"message ignored, no trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"");
+                if (extension_settings.speech_recognition.Streaming.debug) {
+                    toastr.info(
+                        "No trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"",
+                        DEBUG_PREFIX+"message ignored.",
+                        { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true },
+                    );
+                }
+            }
+            else {
+                userMessageFormatted = userMessageFormatted.substring(messageStart);
+                processTranscript(userMessageFormatted);
+            }
+        }
+        else {
+            console.debug(DEBUG_PREFIX+"Received empty transcript, ignored");
+        }
+    }
+    catch (error) {
+        console.debug(error);
+    }
+    finally {
+        inApiCall = false;
+    }
+}
+
 async function processTranscript(transcript) {
     try {
         const transcriptOriginal = transcript;
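To make the trigger-word scan above concrete, here is the same normalization and earliest-match logic as a standalone sketch ("hey aqua" and "listen" are just sample trigger words, not defaults):

    // Standalone sketch of the normalization + trigger scan in moduleWorker.
    const triggerWords = ["hey aqua", "listen"];
    const transcript = "okay hey aqua what's the weather like";

    // Same normalization as above: lowercase, strip punctuation, collapse whitespace
    const raw = transcript.toLowerCase().replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");

    let messageStart = -1;
    for (const word of triggerWords) {
        const pos = raw.indexOf(word);
        // Keep the earliest match
        if (pos !== -1 && (messageStart === -1 || pos < messageStart)) {
            messageStart = pos;
        }
    }

    // -1 means "ignore"; otherwise keep everything from the trigger word onward
    console.log(messageStart === -1 ? "(ignored)" : transcript.substring(messageStart));
    // -> "hey aqua what's the weather like"
    // Caveat: messageStart indexes the punctuation-stripped string, so the cut
    // only lines up exactly when no punctuation precedes the trigger word.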
@@ -198,13 +279,21 @@ function loadSttProvider(provider) {
     if (sttProviderName == "Browser") {
         sttProvider.processTranscriptFunction = processTranscript;
         sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
-    }
-    else {
-        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
-        loadNavigatorAudioRecording();
-
         $("#microphone_button").show();
     }
+
+    if (sttProviderName == "Vosk" || sttProviderName == "Whisper") {
+        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
+        loadNavigatorAudioRecording();
+        $("#microphone_button").show();
+    }
+
+    if (sttProviderName == "Streaming") {
+        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
+        $("#microphone_button").off('click');
+        $("#microphone_button").hide();
+    }
+
 }

 function onSttProviderChange() {
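The branches above encode an implicit provider contract: Browser does in-page recognition, Vosk/Whisper record via the navigator, and Streaming needs no microphone button at all. Inferred from this file (not a documented interface), a provider needs roughly this shape; DummySttProvider is hypothetical:

    // Hypothetical provider skeleton, inferred from how loadSttProvider and
    // moduleWorker use the existing providers:
    class DummySttProvider {
        settings = {};
        get settingsHtml() { return ''; }        // HTML injected into the extension's settings panel
        onSettingsChange() {}                    // copy UI state back into this.settings
        loadSettings(settings) { this.settings = settings; }
        async getUserMessage() { return ''; }    // only polled when the provider works like "Streaming"
    }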
@@ -231,7 +320,7 @@ const defaultSettings = {
     messageMode: "append",
     messageMappingText: "",
     messageMapping: [],
-    messageMappingEnabled: false
+    messageMappingEnabled: false,
 }

 function loadSettings() {
@@ -344,8 +433,7 @@ $(document).ready(function () {
     addExtensionControls(); // No init dependencies
     loadSettings(); // Depends on Extension Controls and loadTtsProvider
     loadSttProvider(extension_settings.speech_recognition.currentProvider); // No dependencies
-
-    //const wrapper = new ModuleWorkerWrapper(moduleWorker);
-    //setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
-    //moduleWorker();
+    const wrapper = new ModuleWorkerWrapper(moduleWorker);
+    setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
+    moduleWorker();
 })
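Note the interplay between the 100 ms interval and the blocking server call: ModuleWorkerWrapper fires moduleWorker every UPDATE_INTERVAL, and the inApiCall flag turns overlapping ticks into no-ops, so only one recording round-trip is ever in flight. Reduced to its essentials (a sketch of the pattern, not the wrapper's actual implementation):

    let busy = false;                 // plays the role of inApiCall
    setInterval(async () => {
        if (busy) return;             // drop ticks while a round-trip is in flight
        busy = true;
        try {
            // stand-in for the long-running sttProvider.getUserMessage() round-trip
            await new Promise(resolve => setTimeout(resolve, 1000));
        } finally {
            busy = false;             // the next tick starts a new round-trip almost immediately
        }
    }, 100);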
public/scripts/extensions/speech-recognition/streaming.js (new file, 102 lines)
@@ -0,0 +1,102 @@
+import { getApiUrl, doExtrasFetch, modules } from "../../extensions.js";
+export { StreamingSttProvider }
+
+const DEBUG_PREFIX = "<Speech Recognition module (streaming)> "
+
+class StreamingSttProvider {
+    //########//
+    // Config //
+    //########//
+
+    settings
+
+    defaultSettings = {
+        triggerWordsText: "",
+        triggerWords: [],
+        triggerWordsEnabled: false,
+        debug: false,
+    }
+
+    get settingsHtml() {
+        let html = `
+        <div id="speech_recognition_streaming_trigger_words_div">
+            <span>Trigger words</span>
+            <textarea id="speech_recognition_streaming_trigger_words" class="text_pole textarea_compact" type="text" rows="4" placeholder="Enter comma-separated words that trigger a new message, e.g.: hey, hey aqua, record, listen"></textarea>
+            <label class="checkbox_label" for="speech_recognition_streaming_trigger_words_enabled">
+                <input type="checkbox" id="speech_recognition_streaming_trigger_words_enabled" name="speech_recognition_trigger_words_enabled">
+                <small>Enable trigger words</small>
+            </label>
+            <label class="checkbox_label" for="speech_recognition_streaming_debug">
+                <input type="checkbox" id="speech_recognition_streaming_debug" name="speech_recognition_streaming_debug">
+                <small>Enable debug pop-ups</small>
+            </label>
+        </div>
+        `
+        return html
+    }
+
+    onSettingsChange() {
+        this.settings.triggerWordsText = $('#speech_recognition_streaming_trigger_words').val();
+        // Parse the comma-separated list into trimmed, lowercased, non-empty words
+        let array = $('#speech_recognition_streaming_trigger_words').val().split(",");
+        array = array.map(element => element.trim().toLowerCase());
+        array = array.filter((str) => str !== '');
+        this.settings.triggerWords = array;
+        this.settings.triggerWordsEnabled = $("#speech_recognition_streaming_trigger_words_enabled").is(':checked');
+        this.settings.debug = $("#speech_recognition_streaming_debug").is(':checked');
+        console.debug(DEBUG_PREFIX+"Updated settings: ", this.settings);
+        this.loadSettings(this.settings);
+    }
+
+    loadSettings(settings) {
+        // Populate provider UI given input settings
+        if (Object.keys(settings).length == 0) {
+            console.debug(DEBUG_PREFIX+"Using default Streaming STT extension settings")
+        }
+
+        // Only accept keys defined in defaultSettings (copy, so the defaults stay pristine)
+        this.settings = Object.assign({}, this.defaultSettings)
+
+        for (const key in settings) {
+            if (key in this.settings) {
+                this.settings[key] = settings[key]
+            } else {
+                throw `Invalid setting passed to STT extension: ${key}`
+            }
+        }
+
+        $("#speech_recognition_streaming_trigger_words").val(this.settings.triggerWordsText);
+        $("#speech_recognition_streaming_trigger_words_enabled").prop('checked', this.settings.triggerWordsEnabled);
+        $("#speech_recognition_streaming_debug").prop('checked', this.settings.debug);
+
+        console.debug(DEBUG_PREFIX+"Streaming STT settings loaded")
+    }
+
+    async getUserMessage() {
+        // Return if module is not loaded
+        if (!modules.includes('streaming-stt')) {
+            console.debug(DEBUG_PREFIX+"Module streaming-stt must be activated in SillyTavern Extras for streaming user voice.")
+            return "";
+        }
+
+        const url = new URL(getApiUrl());
+        url.pathname = '/api/speech-recognition/streaming/record-and-transcript';
+
+        const apiResult = await doExtrasFetch(url, {
+            method: 'POST',
+            headers: {
+                'Content-Type': 'application/json',
+                'Bypass-Tunnel-Reminder': 'bypass',
+            },
+            body: JSON.stringify({ text: "" }),
+        });
+
+        if (!apiResult.ok) {
+            toastr.error(apiResult.statusText, DEBUG_PREFIX+'STT Generation Failed (streaming)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
+            throw new Error(`HTTP ${apiResult.status}: ${await apiResult.text()}`);
+        }
+
+        const data = await apiResult.json();
+        return data.transcript;
+    }
+}
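For reference, the contract this provider expects from Extras, as read off the code above: the `streaming-stt` Extras module must serve POST /api/speech-recognition/streaming/record-and-transcript, block until a voice segment has been recorded and transcribed, and answer with JSON of the form { "transcript": "..." }. A hypothetical call sequence (assumes the settings UI has been injected so the jQuery selectors in loadSettings resolve, and runs inside an async context):

    // Hypothetical usage; Extras must report the 'streaming-stt' module as active.
    const provider = new StreamingSttProvider();
    provider.loadSettings({});                     // empty input falls back to defaultSettings
    const text = await provider.getUserMessage();  // blocks until a segment is recorded + transcribed
    if (text.trim().length > 0) {
        console.log("user said:", text);
    }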