mirror of
https://github.com/SillyTavern/SillyTavern.git
synced 2025-02-01 11:56:48 +01:00
Restored streaming mode as a new provider "Streaming", recording is done on server side, voice detection with vosk and transcript with whisper.
This commit is contained in:
parent
8aff89de30
commit
192c82b180
@ -8,16 +8,21 @@ import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper
|
|||||||
import { VoskSttProvider } from './vosk.js'
|
import { VoskSttProvider } from './vosk.js'
|
||||||
import { WhisperSttProvider } from './whisper.js'
|
import { WhisperSttProvider } from './whisper.js'
|
||||||
import { BrowserSttProvider } from './browser.js'
|
import { BrowserSttProvider } from './browser.js'
|
||||||
|
import { StreamingSttProvider } from './streaming.js'
|
||||||
export { MODULE_NAME };
|
export { MODULE_NAME };
|
||||||
|
|
||||||
const MODULE_NAME = 'Speech Recognition';
|
const MODULE_NAME = 'Speech Recognition';
|
||||||
const DEBUG_PREFIX = "<Speech Recognition module> "
|
const DEBUG_PREFIX = "<Speech Recognition module> "
|
||||||
|
const UPDATE_INTERVAL = 100;
|
||||||
|
|
||||||
|
let inApiCall = false;
|
||||||
|
|
||||||
let sttProviders = {
|
let sttProviders = {
|
||||||
None: null,
|
None: null,
|
||||||
Browser: BrowserSttProvider,
|
Browser: BrowserSttProvider,
|
||||||
Whisper: WhisperSttProvider,
|
Whisper: WhisperSttProvider,
|
||||||
Vosk: VoskSttProvider,
|
Vosk: VoskSttProvider,
|
||||||
|
Streaming: StreamingSttProvider,
|
||||||
}
|
}
|
||||||
|
|
||||||
let sttProvider = null
|
let sttProvider = null
|
||||||
@ -27,6 +32,82 @@ let audioRecording = false
|
|||||||
const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } };
|
const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } };
|
||||||
let audioChunks = [];
|
let audioChunks = [];
|
||||||
|
|
||||||
|
/**
 * Module worker for the "Streaming" STT provider.
 *
 * Polls the provider for a server-side recorded transcript, optionally gates
 * it on configured trigger words, and forwards the accepted text to
 * processTranscript(). Does nothing for other providers, and skips a tick
 * while a previous provider call is still in flight (guarded by inApiCall).
 */
async function moduleWorker() {
    // Only the streaming provider is driven by this polling worker;
    // the other providers are event/button driven.
    if (sttProviderName !== "Streaming") {
        return;
    }

    // API is busy
    if (inApiCall) {
        return;
    }

    try {
        inApiCall = true;
        const userMessageOriginal = await sttProvider.getUserMessage();
        let userMessageFormatted = userMessageOriginal.trim();

        if (userMessageFormatted.length > 0) {
            console.debug(DEBUG_PREFIX+"recorded transcript: \""+userMessageFormatted+"\"");

            const userMessageLower = userMessageFormatted.toLowerCase();
            // remove punctuation and collapse whitespace for trigger-word matching
            const userMessageRaw = userMessageLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");

            console.debug(DEBUG_PREFIX+"raw transcript:",userMessageRaw);

            // Detect trigger words; stays -1 when no trigger word is found.
            let messageStart = -1;

            if (extension_settings.speech_recognition.Streaming.triggerWordsEnabled) {
                for (const triggerWord of extension_settings.speech_recognition.Streaming.triggerWords) {
                    const triggerPos = userMessageRaw.indexOf(triggerWord.toLowerCase());

                    // Trigger word not found in the transcript
                    if (triggerPos === -1) {
                        console.debug(DEBUG_PREFIX+"trigger word not found: ", triggerWord);
                    }
                    else {
                        console.debug(DEBUG_PREFIX+"Found trigger word: ", triggerWord, " at index ", triggerPos);
                        // Keep the earliest trigger-word occurrence.
                        // NOTE: was `triggerPos < messageStart | messageStart == -1` —
                        // bitwise OR evaluated both operands; use short-circuit ||.
                        if (messageStart === -1 || triggerPos < messageStart) {
                            messageStart = triggerPos;
                        }
                    }
                }
            } else {
                // Trigger words disabled: accept the whole transcript.
                messageStart = 0;
            }

            if (messageStart === -1) {
                console.debug(DEBUG_PREFIX+"message ignored, no trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"");
                if (extension_settings.speech_recognition.Streaming.debug) {
                    toastr.info(
                        "No trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"",
                        DEBUG_PREFIX+"message ignored.",
                        { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true },
                    );
                }
            }
            else {
                // Strip everything before the trigger word and hand the rest off.
                userMessageFormatted = userMessageFormatted.substring(messageStart);
                processTranscript(userMessageFormatted);
            }
        }
        else {
            console.debug(DEBUG_PREFIX+"Received empty transcript, ignored");
        }
    }
    catch (error) {
        // Best-effort worker: log and keep polling on the next tick.
        console.debug(error);
    }
    finally {
        inApiCall = false;
    }
}
|
||||||
|
|
||||||
async function processTranscript(transcript) {
|
async function processTranscript(transcript) {
|
||||||
try {
|
try {
|
||||||
const transcriptOriginal = transcript;
|
const transcriptOriginal = transcript;
|
||||||
@ -198,13 +279,21 @@ function loadSttProvider(provider) {
|
|||||||
if (sttProviderName == "Browser") {
|
if (sttProviderName == "Browser") {
|
||||||
sttProvider.processTranscriptFunction = processTranscript;
|
sttProvider.processTranscriptFunction = processTranscript;
|
||||||
sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
|
sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
|
||||||
}
|
|
||||||
else {
|
|
||||||
sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
|
|
||||||
loadNavigatorAudioRecording();
|
|
||||||
|
|
||||||
$("#microphone_button").show();
|
$("#microphone_button").show();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sttProviderName == "Vosk" | sttProviderName == "Whisper") {
|
||||||
|
sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
|
||||||
|
loadNavigatorAudioRecording();
|
||||||
|
$("#microphone_button").show();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sttProviderName == "Streaming") {
|
||||||
|
sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
|
||||||
|
$("#microphone_button").off('click');
|
||||||
|
$("#microphone_button").hide();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function onSttProviderChange() {
|
function onSttProviderChange() {
|
||||||
@ -231,7 +320,7 @@ const defaultSettings = {
|
|||||||
messageMode: "append",
|
messageMode: "append",
|
||||||
messageMappingText: "",
|
messageMappingText: "",
|
||||||
messageMapping: [],
|
messageMapping: [],
|
||||||
messageMappingEnabled: false
|
messageMappingEnabled: false,
|
||||||
}
|
}
|
||||||
|
|
||||||
function loadSettings() {
|
function loadSettings() {
|
||||||
@ -344,8 +433,7 @@ $(document).ready(function () {
|
|||||||
addExtensionControls(); // No init dependencies
|
addExtensionControls(); // No init dependencies
|
||||||
loadSettings(); // Depends on Extension Controls and loadTtsProvider
|
loadSettings(); // Depends on Extension Controls and loadTtsProvider
|
||||||
loadSttProvider(extension_settings.speech_recognition.currentProvider); // No dependencies
|
loadSttProvider(extension_settings.speech_recognition.currentProvider); // No dependencies
|
||||||
|
const wrapper = new ModuleWorkerWrapper(moduleWorker);
|
||||||
//const wrapper = new ModuleWorkerWrapper(moduleWorker);
|
setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
|
||||||
//setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
|
moduleWorker();
|
||||||
//moduleWorker();
|
|
||||||
})
|
})
|
||||||
|
102
public/scripts/extensions/speech-recognition/streaming.js
Normal file
102
public/scripts/extensions/speech-recognition/streaming.js
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
import { getApiUrl, doExtrasFetch, modules } from "../../extensions.js";
|
||||||
|
export { StreamingSttProvider }
|
||||||
|
|
||||||
|
const DEBUG_PREFIX = "<Speech Recognition module (streaming)> "
|
||||||
|
|
||||||
|
/**
 * STT provider that records audio server-side (SillyTavern Extras
 * "streaming-stt" module) and returns the transcribed user message.
 */
class StreamingSttProvider {
    //########//
    // Config //
    //########//

    // Active settings object; populated by loadSettings().
    settings

    // Default settings — loadSettings() accepts only these keys.
    defaultSettings = {
        triggerWordsText: "",
        triggerWords : [],
        triggerWordsEnabled : false,
        debug : false,
    }

    // Provider settings UI fragment injected into the extension panel.
    get settingsHtml() {
        let html = '\
        <div id="speech_recognition_streaming_trigger_words_div">\
            <span>Trigger words</span>\
            <textarea id="speech_recognition_streaming_trigger_words" class="text_pole textarea_compact" type="text" rows="4" placeholder="Enter comma separated words that triggers new message, example:\nhey, hey aqua, record, listen"></textarea>\
            <label class="checkbox_label" for="speech_recognition_streaming_trigger_words_enabled">\
                <input type="checkbox" id="speech_recognition_streaming_trigger_words_enabled" name="speech_recognition_trigger_words_enabled">\
                <small>Enable trigger words</small>\
            </label>\
            <label class="checkbox_label" for="speech_recognition_streaming_debug">\
                <input type="checkbox" id="speech_recognition_streaming_debug" name="speech_recognition_streaming_debug">\
                <small>Enable debug pop ups</small>\
            </label>\
        </div>\
        '
        return html
    }

    // Read the UI controls back into this.settings and re-apply them.
    onSettingsChange() {
        this.settings.triggerWordsText = $('#speech_recognition_streaming_trigger_words').val();
        // Parse the comma-separated list into trimmed, lowercased, non-empty words.
        let array = $('#speech_recognition_streaming_trigger_words').val().split(",");
        array = array.map(element => {return element.trim().toLowerCase();});
        array = array.filter((str) => str !== '');
        this.settings.triggerWords = array;
        this.settings.triggerWordsEnabled = $("#speech_recognition_streaming_trigger_words_enabled").is(':checked');
        this.settings.debug = $("#speech_recognition_streaming_debug").is(':checked');
        console.debug(DEBUG_PREFIX+" Updated settings: ", this.settings);
        this.loadSettings(this.settings);
    }

    /**
     * Apply input settings (validated against defaultSettings) and populate
     * the provider UI accordingly.
     * @param {Object} settings - persisted settings; empty object uses defaults
     * @throws {Error} when an unknown settings key is passed
     */
    loadSettings(settings) {
        // Populate Provider UI given input settings
        if (Object.keys(settings).length === 0) {
            // Was "Whisper" — copy-paste from the Whisper provider.
            console.debug(DEBUG_PREFIX+"Using default Streaming STT extension settings")
        }

        // Only accept keys defined in defaultSettings.
        // Copy the defaults so later assignments don't mutate defaultSettings.
        this.settings = { ...this.defaultSettings }

        for (const key in settings){
            if (key in this.settings){
                this.settings[key] = settings[key]
            } else {
                throw new Error(`Invalid setting passed to STT extension: ${key}`)
            }
        }

        $("#speech_recognition_streaming_trigger_words").val(this.settings.triggerWordsText);
        $("#speech_recognition_streaming_trigger_words_enabled").prop('checked',this.settings.triggerWordsEnabled);
        $("#speech_recognition_streaming_debug").prop('checked',this.settings.debug);

        console.debug(DEBUG_PREFIX+"streaming STT settings loaded")
    }

    /**
     * Ask the Extras server to record and transcribe one user utterance.
     * @returns {Promise<string>} the transcript, or "" when the Extras
     *          "streaming-stt" module is not active
     * @throws {Error} on a non-OK HTTP response
     */
    async getUserMessage() {
        // Return if module is not loaded
        if (!modules.includes('streaming-stt')) {
            console.debug(DEBUG_PREFIX+"Module streaming-stt must be activated in Sillytavern Extras for streaming user voice.")
            return "";
        }

        const url = new URL(getApiUrl());
        url.pathname = '/api/speech-recognition/streaming/record-and-transcript';

        const apiResult = await doExtrasFetch(url, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
                'Bypass-Tunnel-Reminder': 'bypass',
            },
            body: JSON.stringify({ text: "" }),
        });

        if (!apiResult.ok) {
            toastr.error(apiResult.statusText, DEBUG_PREFIX+'STT Generation Failed (streaming)', { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
            throw new Error(`HTTP ${apiResult.status}: ${await apiResult.text()}`);
        }

        const data = await apiResult.json();
        return data.transcript;
    }
}
|
Loading…
x
Reference in New Issue
Block a user