Restored streaming mode as a new provider, "Streaming": recording is done server-side, with voice detection by Vosk and transcription by Whisper.
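In short: with this provider the client no longer records audio at all. A module worker polls the new "Streaming" provider, whose getUserMessage() POSTs to a SillyTavern Extras endpoint that records on the server and returns the finished transcript, which the client then scans for trigger words. A minimal sketch of that loop, using names from the diff below (the server-side blocking behavior is inferred from the commit message, not shown in this diff):

    // Sketch only: the client-side polling loop this commit introduces.
    setInterval(async () => {
        // Blocks until Extras has recorded a voice segment (vosk) and transcribed it (whisper)
        const transcript = await sttProvider.getUserMessage();
        if (transcript.trim().length > 0) {
            processTranscript(transcript); // in moduleWorker this happens after trigger-word filtering
        }
    }, UPDATE_INTERVAL);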
@@ -8,16 +8,21 @@ import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper
 import { VoskSttProvider } from './vosk.js'
 import { WhisperSttProvider } from './whisper.js'
 import { BrowserSttProvider } from './browser.js'
+import { StreamingSttProvider } from './streaming.js'
 export { MODULE_NAME };

 const MODULE_NAME = 'Speech Recognition';
 const DEBUG_PREFIX = "<Speech Recognition module> "
+const UPDATE_INTERVAL = 100;
+
+let inApiCall = false;

 let sttProviders = {
     None: null,
     Browser: BrowserSttProvider,
     Whisper: WhisperSttProvider,
     Vosk: VoskSttProvider,
+    Streaming: StreamingSttProvider,
 }

 let sttProvider = null
@@ -27,6 +32,82 @@ let audioRecording = false
 const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } };
 let audioChunks = [];

+async function moduleWorker() {
+    if (sttProviderName != "Streaming") {
+        return;
+    }
+
+    // API is busy
+    if (inApiCall) {
+        return;
+    }
+
+    try {
+        inApiCall = true;
+        const userMessageOriginal = await sttProvider.getUserMessage();
+        let userMessageFormatted = userMessageOriginal.trim();
+
+        if (userMessageFormatted.length > 0) {
+            console.debug(DEBUG_PREFIX+"recorded transcript: \""+userMessageFormatted+"\"");
+
+            let userMessageLower = userMessageFormatted.toLowerCase();
+            // remove punctuation
+            let userMessageRaw = userMessageLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");
+
+            console.debug(DEBUG_PREFIX+"raw transcript:", userMessageRaw);
+
+            // Detect trigger words
+            let messageStart = -1;
+
+            if (extension_settings.speech_recognition.Streaming.triggerWordsEnabled) {
+                for (const triggerWord of extension_settings.speech_recognition.Streaming.triggerWords) {
+                    const triggerPos = userMessageRaw.indexOf(triggerWord.toLowerCase());
+
+                    // Trigger word not found in the transcript
+                    if (triggerPos == -1) {
+                        console.debug(DEBUG_PREFIX+"trigger word not found: ", triggerWord);
+                    }
+                    else {
+                        console.debug(DEBUG_PREFIX+"Found trigger word: ", triggerWord, " at index ", triggerPos);
+                        // Keep the earliest trigger word found in the transcript
+                        if (triggerPos < messageStart || messageStart == -1) {
+                            messageStart = triggerPos;
+                        }
+                    }
+                }
+            } else {
+                messageStart = 0;
+            }
+
+            if (messageStart == -1) {
+                console.debug(DEBUG_PREFIX+"message ignored, no trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"");
+                if (extension_settings.speech_recognition.Streaming.debug) {
+                    toastr.info(
+                        "No trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"",
+                        DEBUG_PREFIX+"message ignored.",
+                        { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true },
+                    );
+                }
+            }
+            else {
+                userMessageFormatted = userMessageFormatted.substring(messageStart);
+                processTranscript(userMessageFormatted);
+            }
+        }
+        else {
+            console.debug(DEBUG_PREFIX+"Received empty transcript, ignored");
+        }
+    }
+    catch (error) {
+        console.debug(error);
+    }
+    finally {
+        inApiCall = false;
+    }
+}
+
 async function processTranscript(transcript) {
     try {
         const transcriptOriginal = transcript;
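To make the trigger-word scan above concrete, here is the same normalization and earliest-match logic as a standalone sketch ("hey aqua" and "listen" are just sample trigger words, not defaults):

    // Standalone sketch of the normalization + trigger scan in moduleWorker.
    const triggerWords = ["hey aqua", "listen"];
    const transcript = "okay hey aqua what's the weather like";

    // Same normalization as above: lowercase, strip punctuation, collapse whitespace
    const raw = transcript.toLowerCase().replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");

    let messageStart = -1;
    for (const word of triggerWords) {
        const pos = raw.indexOf(word);
        // Keep the earliest match
        if (pos !== -1 && (messageStart === -1 || pos < messageStart)) {
            messageStart = pos;
        }
    }

    // -1 means "ignore"; otherwise keep everything from the trigger word onward
    console.log(messageStart === -1 ? "(ignored)" : transcript.substring(messageStart));
    // -> "hey aqua what's the weather like"
    // Caveat: messageStart indexes the punctuation-stripped string, so the cut
    // only lines up exactly when no punctuation precedes the trigger word.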
@@ -198,13 +279,21 @@ function loadSttProvider(provider) {
     if (sttProviderName == "Browser") {
         sttProvider.processTranscriptFunction = processTranscript;
         sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
-    }
-    else {
-        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
-        loadNavigatorAudioRecording();
-
         $("#microphone_button").show();
     }
+
+    if (sttProviderName == "Vosk" || sttProviderName == "Whisper") {
+        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
+        loadNavigatorAudioRecording();
+        $("#microphone_button").show();
+    }
+
+    if (sttProviderName == "Streaming") {
+        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
+        $("#microphone_button").off('click');
+        $("#microphone_button").hide();
+    }
+
 }

 function onSttProviderChange() {
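The branches above encode an implicit provider contract: Browser does in-page recognition, Vosk/Whisper record via the navigator, and Streaming needs no microphone button at all. Inferred from this file (not a documented interface), a provider needs roughly this shape; DummySttProvider is hypothetical:

    // Hypothetical provider skeleton, inferred from how loadSttProvider and
    // moduleWorker use the existing providers:
    class DummySttProvider {
        settings = {};
        get settingsHtml() { return ''; }        // HTML injected into the extension's settings panel
        onSettingsChange() {}                    // copy UI state back into this.settings
        loadSettings(settings) { this.settings = settings; }
        async getUserMessage() { return ''; }    // only polled when the provider works like "Streaming"
    }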
@@ -231,7 +320,7 @@ const defaultSettings = {
     messageMode: "append",
     messageMappingText: "",
     messageMapping: [],
-    messageMappingEnabled: false
+    messageMappingEnabled: false,
 }

 function loadSettings() {
@@ -344,8 +433,7 @@ $(document).ready(function () {
     addExtensionControls(); // No init dependencies
     loadSettings(); // Depends on Extension Controls and loadTtsProvider
     loadSttProvider(extension_settings.speech_recognition.currentProvider); // No dependencies
-
-    //const wrapper = new ModuleWorkerWrapper(moduleWorker);
-    //setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
-    //moduleWorker();
+    const wrapper = new ModuleWorkerWrapper(moduleWorker);
+    setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
+    moduleWorker();
 })
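Note the interplay between the 100 ms interval and the blocking server call: ModuleWorkerWrapper fires moduleWorker every UPDATE_INTERVAL, and the inApiCall flag turns overlapping ticks into no-ops, so only one recording round-trip is ever in flight. Reduced to its essentials (a sketch of the pattern, not the wrapper's actual implementation):

    let busy = false;                 // plays the role of inApiCall
    setInterval(async () => {
        if (busy) return;             // drop ticks while a round-trip is in flight
        busy = true;
        try {
            // stand-in for the long-running sttProvider.getUserMessage() round-trip
            await new Promise(resolve => setTimeout(resolve, 1000));
        } finally {
            busy = false;             // the next tick starts a new round-trip almost immediately
        }
    }, 100);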
public/scripts/extensions/speech-recognition/streaming.js (new file, 102 lines)
@@ -0,0 +1,102 @@
+import { getApiUrl, doExtrasFetch, modules } from "../../extensions.js";
+export { StreamingSttProvider }
+
+const DEBUG_PREFIX = "<Speech Recognition module (streaming)> "
+
+class StreamingSttProvider {
+    //########//
+    // Config //
+    //########//
+
+    settings
+
+    defaultSettings = {
+        triggerWordsText: "",
+        triggerWords: [],
+        triggerWordsEnabled: false,
+        debug: false,
+    }
+
+    get settingsHtml() {
+        let html = `
+        <div id="speech_recognition_streaming_trigger_words_div">
+            <span>Trigger words</span>
+            <textarea id="speech_recognition_streaming_trigger_words" class="text_pole textarea_compact" type="text" rows="4" placeholder="Enter comma-separated words that trigger a new message, e.g.: hey, hey aqua, record, listen"></textarea>
+            <label class="checkbox_label" for="speech_recognition_streaming_trigger_words_enabled">
+                <input type="checkbox" id="speech_recognition_streaming_trigger_words_enabled" name="speech_recognition_trigger_words_enabled">
+                <small>Enable trigger words</small>
+            </label>
+            <label class="checkbox_label" for="speech_recognition_streaming_debug">
+                <input type="checkbox" id="speech_recognition_streaming_debug" name="speech_recognition_streaming_debug">
+                <small>Enable debug pop-ups</small>
+            </label>
+        </div>
+        `
+        return html
+    }
+
+    onSettingsChange() {
+        this.settings.triggerWordsText = $('#speech_recognition_streaming_trigger_words').val();
+        // Parse the comma-separated list into trimmed, lowercased, non-empty words
+        let array = $('#speech_recognition_streaming_trigger_words').val().split(",");
+        array = array.map(element => element.trim().toLowerCase());
+        array = array.filter((str) => str !== '');
+        this.settings.triggerWords = array;
+        this.settings.triggerWordsEnabled = $("#speech_recognition_streaming_trigger_words_enabled").is(':checked');
+        this.settings.debug = $("#speech_recognition_streaming_debug").is(':checked');
+        console.debug(DEBUG_PREFIX+"Updated settings: ", this.settings);
+        this.loadSettings(this.settings);
+    }
+
+    loadSettings(settings) {
+        // Populate provider UI given input settings
+        if (Object.keys(settings).length == 0) {
+            console.debug(DEBUG_PREFIX+"Using default Streaming STT extension settings")
+        }
+
+        // Only accept keys defined in defaultSettings (copy, so the defaults stay pristine)
+        this.settings = Object.assign({}, this.defaultSettings)
+
+        for (const key in settings) {
+            if (key in this.settings) {
+                this.settings[key] = settings[key]
+            } else {
+                throw `Invalid setting passed to STT extension: ${key}`
+            }
+        }
+
+        $("#speech_recognition_streaming_trigger_words").val(this.settings.triggerWordsText);
+        $("#speech_recognition_streaming_trigger_words_enabled").prop('checked', this.settings.triggerWordsEnabled);
+        $("#speech_recognition_streaming_debug").prop('checked', this.settings.debug);
+
+        console.debug(DEBUG_PREFIX+"Streaming STT settings loaded")
+    }
+
+    async getUserMessage() {
+        // Return if module is not loaded
+        if (!modules.includes('streaming-stt')) {
+            console.debug(DEBUG_PREFIX+"Module streaming-stt must be activated in SillyTavern Extras for streaming user voice.")
+            return "";
+        }
+
+        const url = new URL(getApiUrl());
+        url.pathname = '/api/speech-recognition/streaming/record-and-transcript';
+
+        const apiResult = await doExtrasFetch(url, {
+            method: 'POST',
+            headers: {
+                'Content-Type': 'application/json',
+                'Bypass-Tunnel-Reminder': 'bypass',
+            },
+            body: JSON.stringify({ text: "" }),
+        });
+
+        if (!apiResult.ok) {
+            toastr.error(apiResult.statusText, DEBUG_PREFIX+'STT Generation Failed (streaming)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
+            throw new Error(`HTTP ${apiResult.status}: ${await apiResult.text()}`);
+        }
+
+        const data = await apiResult.json();
+        return data.transcript;
+    }
+}
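For reference, the contract this provider expects from Extras, as read off the code above: the `streaming-stt` Extras module must serve POST /api/speech-recognition/streaming/record-and-transcript, block until a voice segment has been recorded and transcribed, and answer with JSON of the form { "transcript": "..." }. A hypothetical call sequence (assumes the settings UI has been injected so the jQuery selectors in loadSettings resolve, and runs inside an async context):

    // Hypothetical usage; Extras must report the 'streaming-stt' module as active.
    const provider = new StreamingSttProvider();
    provider.loadSettings({});                     // empty input falls back to defaultSettings
    const text = await provider.getUserMessage();  // blocks until a segment is recorded + transcribed
    if (text.trim().length > 0) {
        console.log("user said:", text);
    }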