/*
TODO:
 - try pseudo-streaming audio by sending a chunk every X seconds and asking VOSK if it is full text.
*/
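/*
A minimal sketch of that idea, assuming sttProvider.processAudio() tolerates
partial blobs (it may not: with a timeslice, MediaRecorder chunks after the
first lack container headers, so the backend would either need to accept raw
fragments or the chunks would need to be accumulated before sending):

    const recorder = new MediaRecorder(stream);
    recorder.ondataavailable = async (e) => {
        const transcript = await sttProvider.processAudio(e.data);
        if (transcript.trim().length > 0) processTranscript(transcript);
    };
    recorder.start(3000); // fire ondataavailable with a partial chunk every 3 seconds
*/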
import { saveSettingsDebounced } from "../../../script.js";
import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper, doExtrasFetch } from "../../extensions.js";
import { VoskSttProvider } from './vosk.js';
import { WhisperSttProvider } from './whisper.js';
import { BrowserSttProvider } from './browser.js';
import { StreamingSttProvider } from './streaming.js';

export { MODULE_NAME };
const MODULE_NAME = 'Speech Recognition';
const DEBUG_PREFIX = "<Speech Recognition module> ";
const UPDATE_INTERVAL = 100;

let inApiCall = false;
const sttProviders = {
    None: null,
    Browser: BrowserSttProvider,
    Whisper: WhisperSttProvider,
    Vosk: VoskSttProvider,
    Streaming: StreamingSttProvider,
};
let sttProvider = null;
let sttProviderName = "None";
let audioRecording = false;
const constraints = { audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } };
let audioChunks = [];
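// The constraints above request 16-bit, 16 kHz mono capture, roughly what
// Vosk-style recognizers expect. Note that browsers treat getUserMedia
// constraints as hints; the actual track settings can be checked with
// stream.getAudioTracks()[0].getSettings().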
async function moduleWorker() {
    if (sttProviderName != "Streaming") {
        return;
    }

    // API is busy
    if (inApiCall) {
        return;
    }

    try {
        inApiCall = true;
        const userMessageOriginal = await sttProvider.getUserMessage();
        let userMessageFormatted = userMessageOriginal.trim();

        if (userMessageFormatted.length > 0) {
            console.debug(DEBUG_PREFIX + "recorded transcript: \"" + userMessageFormatted + "\"");

            let userMessageLower = userMessageFormatted.toLowerCase();
            // Remove punctuation and collapse whitespace
            let userMessageRaw = userMessageLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");
            console.debug(DEBUG_PREFIX + "raw transcript:", userMessageRaw);

            // Detect trigger words
            let messageStart = -1;
            if (extension_settings.speech_recognition.Streaming.triggerWordsEnabled) {
                for (const triggerWord of extension_settings.speech_recognition.Streaming.triggerWords) {
                    const triggerPos = userMessageRaw.indexOf(triggerWord.toLowerCase());

                    if (triggerPos == -1) {
                        console.debug(DEBUG_PREFIX + "trigger word not found: ", triggerWord);
                    }
                    else {
                        console.debug(DEBUG_PREFIX + "Found trigger word: ", triggerWord, " at index ", triggerPos);
                        // Keep the earliest trigger word occurrence
                        if (messageStart == -1 || triggerPos < messageStart) {
                            messageStart = triggerPos;
                        }
                    }
                }
            } else {
                messageStart = 0;
            }

            if (messageStart == -1) {
                console.debug(DEBUG_PREFIX + "message ignored, no trigger word preceding a message. Voice transcript: \"" + userMessageOriginal + "\"");
                if (extension_settings.speech_recognition.Streaming.debug) {
                    toastr.info(
                        "No trigger word preceding a message. Voice transcript: \"" + userMessageOriginal + "\"",
                        DEBUG_PREFIX + "message ignored.",
                        { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true },
                    );
                }
            }
            else {
                userMessageFormatted = userMessageFormatted.substring(messageStart);
                processTranscript(userMessageFormatted);
            }
        }
        else {
            console.debug(DEBUG_PREFIX + "Received empty transcript, ignored");
        }
    }
    catch (error) {
        console.debug(error);
    }
    finally {
        inApiCall = false;
    }
}
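/*
A hedged sketch for stricter trigger matching (not wired in): the indexOf()
check above also fires when a trigger word is embedded in a larger word
("hey" matches "they"). A word-boundary regex avoids that; escapeRegex is a
hypothetical helper, shown inline:

    function escapeRegex(str) {
        return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    }

    function findTriggerWord(rawTranscript, triggerWord) {
        const pattern = new RegExp("\\b" + escapeRegex(triggerWord.toLowerCase()) + "\\b");
        const match = rawTranscript.match(pattern);
        return match ? match.index : -1; // drop-in replacement for indexOf()
    }
*/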
async function processTranscript(transcript) {
    try {
        const transcriptOriginal = transcript;
        let transcriptFormatted = transcriptOriginal.trim();

        if (transcriptFormatted.length > 0) {
            console.debug(DEBUG_PREFIX + "recorded transcript: \"" + transcriptFormatted + "\"");
            const messageMode = extension_settings.speech_recognition.messageMode;
            console.debug(DEBUG_PREFIX + "mode: " + messageMode);

            let transcriptLower = transcriptFormatted.toLowerCase();
            // Remove punctuation and collapse whitespace
            let transcriptRaw = transcriptLower.replace(/[^\w\s\']|_/g, "").replace(/\s+/g, " ");

            // Check message mapping
            if (extension_settings.speech_recognition.messageMappingEnabled) {
                console.debug(DEBUG_PREFIX + "Start searching message mapping into:", transcriptRaw);
                for (const key in extension_settings.speech_recognition.messageMapping) {
                    console.debug(DEBUG_PREFIX + "message mapping searching: ", key, "=>", extension_settings.speech_recognition.messageMapping[key]);
                    if (transcriptRaw.includes(key)) {
                        const message = extension_settings.speech_recognition.messageMapping[key];
                        console.debug(DEBUG_PREFIX + "message mapping found: ", key, "=>", message);
                        $("#send_textarea").val(message);

                        if (messageMode == "auto_send") await getContext().generate();
                        return;
                    }
                }
            }

            console.debug(DEBUG_PREFIX + "no message mapping found, processing transcript as normal message");

            switch (messageMode) {
                case "auto_send": {
                    // Clear the message area to avoid sending the transcript twice
                    $('#send_textarea').val("");
                    console.debug(DEBUG_PREFIX + "Sending message");
                    const context = getContext();
                    const message = {
                        name: context.name1,
                        is_user: true,
                        is_name: true,
                        send_date: Date.now(),
                        mes: transcriptFormatted,
                    };
                    context.chat.push(message);
                    context.addOneMessage(message);
                    await context.generate();
                    $('#debug_output').text("<STT-module DEBUG>: message sent: \"" + transcriptFormatted + "\"");
                    break;
                }
                case "replace":
                    console.debug(DEBUG_PREFIX + "Replacing message");
                    $('#send_textarea').val(transcriptFormatted);
                    break;
                case "append":
                    console.debug(DEBUG_PREFIX + "Appending message");
                    $('#send_textarea').val($('#send_textarea').val() + " " + transcriptFormatted);
                    break;
                default:
                    console.debug(DEBUG_PREFIX + "Unsupported STT message mode: " + messageMode);
            }
        }
        else {
            console.debug(DEBUG_PREFIX + "Empty transcript, do nothing");
        }
    }
    catch (error) {
        console.debug(error);
    }
}
function loadNavigatorAudioRecording() {
    if (navigator.mediaDevices.getUserMedia) {
        console.debug(DEBUG_PREFIX + 'getUserMedia supported by browser.');

        let onSuccess = function (stream) {
            const mediaRecorder = new MediaRecorder(stream);

            $("#microphone_button").off('click').on("click", function () {
                if (!audioRecording) {
                    mediaRecorder.start();
                    console.debug(mediaRecorder.state);
                    console.debug("recorder started");
                    audioRecording = true;
                    $("#microphone_button").toggleClass('fa-microphone fa-microphone-slash');
                }
                else {
                    mediaRecorder.stop();
                    console.debug(mediaRecorder.state);
                    console.debug("recorder stopped");
                    audioRecording = false;
                    $("#microphone_button").toggleClass('fa-microphone fa-microphone-slash');
                }
            });

            mediaRecorder.onstop = async function () {
                console.debug(DEBUG_PREFIX + "data available after MediaRecorder.stop() called: ", audioChunks.length, " chunks");
                // Note: MediaRecorder usually emits webm/ogg audio; the "audio/wav"
                // label below only tags the blob, it does not transcode the data.
                const audioBlob = new Blob(audioChunks, { type: "audio/wav; codecs=0" });
                audioChunks = [];

                const transcript = await sttProvider.processAudio(audioBlob);

                // TODO: lock and release recording while processing?
                console.debug(DEBUG_PREFIX + "received transcript:", transcript);
                processTranscript(transcript);
            }

            mediaRecorder.ondataavailable = function (e) {
                audioChunks.push(e.data);
            }
        }

        let onError = function (err) {
            console.debug(DEBUG_PREFIX + "The following error occurred: " + err);
        }

        navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
    } else {
        console.debug(DEBUG_PREFIX + "getUserMedia not supported on your browser!");
        toastr.error("getUserMedia not supported", DEBUG_PREFIX + "not supported for your browser.", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
    }
}
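/*
One way to address the "lock and release" TODO above (a sketch, not wired in):
guard with a flag so a new recording cannot start while a transcript is being
processed. transcriptionPending would be a new, hypothetical module-level boolean.

    mediaRecorder.onstop = async function () {
        transcriptionPending = true;
        try {
            const audioBlob = new Blob(audioChunks, { type: "audio/wav; codecs=0" });
            audioChunks = [];
            processTranscript(await sttProvider.processAudio(audioBlob));
        }
        finally {
            transcriptionPending = false;
        }
    };
    // ...and at the top of the microphone click handler:
    // if (transcriptionPending) return;
*/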
//##############//
// STT Provider //
//##############//
function loadSttProvider(provider) {
    // Clear the current config and add new config
    $("#speech_recognition_provider_settings").html("");

    // Init provider references
    extension_settings.speech_recognition.currentProvider = provider;
    sttProviderName = provider;

    if (!(sttProviderName in extension_settings.speech_recognition)) {
        console.warn(`Provider ${sttProviderName} not in Extension Settings, initializing provider in settings`);
        extension_settings.speech_recognition[sttProviderName] = {};
    }

    $('#speech_recognition_provider').val(sttProviderName);

    if (sttProviderName == "None") {
        $("#microphone_button").hide();
        $("#speech_recognition_message_mode_div").hide();
        $("#speech_recognition_message_mapping_div").hide();
        return;
    }

    $("#speech_recognition_message_mode_div").show();
    $("#speech_recognition_message_mapping_div").show();

    sttProvider = new sttProviders[sttProviderName]();

    // Init provider settings
    $('#speech_recognition_provider_settings').append(sttProvider.settingsHtml);

    // Use microphone button as push to talk
    if (sttProviderName == "Browser") {
        sttProvider.processTranscriptFunction = processTranscript;
        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
        $("#microphone_button").show();
    }

    if (sttProviderName == "Vosk" || sttProviderName == "Whisper") {
        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
        loadNavigatorAudioRecording();
        $("#microphone_button").show();
    }

    if (sttProviderName == "Streaming") {
        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
        $("#microphone_button").off('click');
        $("#microphone_button").hide();
    }
}
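/*
The provider classes above are duck-typed. An outline of the interface that
loadSttProvider() relies on, inferred from the call sites in this file (not
an official contract):

    class ExampleSttProvider {
        settingsHtml = "";                    // markup appended to the settings form
        settings = {};                        // persisted in extension_settings
        loadSettings(settings) {}             // restore persisted settings
        onSettingsChange() {}                 // react to settings form input
        async processAudio(audioBlob) {}      // Vosk/Whisper: audio blob -> transcript
        async getUserMessage() {}             // Streaming: poll the latest transcript
    }
*/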
function onSttProviderChange() {
    const sttProviderSelection = $('#speech_recognition_provider').val();
    loadSttProvider(sttProviderSelection);
    saveSettingsDebounced();
}

function onSttProviderSettingsInput() {
    sttProvider.onSettingsChange();

    // Persist changes to SillyTavern stt extension settings
    extension_settings.speech_recognition[sttProviderName] = sttProvider.settings;
    saveSettingsDebounced();
    console.info(`Saved settings ${sttProviderName} ${JSON.stringify(sttProvider.settings)}`);
}
//#############################//
// Extension UI and Settings //
//#############################//
const defaultSettings = {
    currentProvider: "None",
    messageMode: "append",
    messageMappingText: "",
    messageMapping: {},
    messageMappingEnabled: false,
};
function loadSettings() {
    if (Object.keys(extension_settings.speech_recognition).length === 0) {
        Object.assign(extension_settings.speech_recognition, defaultSettings);
    }

    $('#speech_recognition_enabled').prop('checked', extension_settings.speech_recognition.enabled);
    $('#speech_recognition_message_mode').val(extension_settings.speech_recognition.messageMode);

    if (extension_settings.speech_recognition.messageMappingText.length > 0) {
        $('#speech_recognition_message_mapping').val(extension_settings.speech_recognition.messageMappingText);
    }

    $('#speech_recognition_message_mapping_enabled').prop('checked', extension_settings.speech_recognition.messageMappingEnabled);
}
async function onMessageModeChange() {
    extension_settings.speech_recognition.messageMode = $('#speech_recognition_message_mode').val();

    if (sttProviderName != "Browser" && extension_settings.speech_recognition.messageMode == "auto_send") {
        $("#speech_recognition_wait_response_div").show();
    }
    else {
        $("#speech_recognition_wait_response_div").hide();
    }

    saveSettingsDebounced();
}
async function onMessageMappingChange() {
    let array = $('#speech_recognition_message_mapping').val().split(",");
    array = array.map(element => element.trim());
    array = array.filter((str) => str !== '');

    extension_settings.speech_recognition.messageMapping = {};
    for (const text of array) {
        if (text.includes("=")) {
            const pair = text.toLowerCase().split("=");
            extension_settings.speech_recognition.messageMapping[pair[0].trim()] = pair[1].trim();
            console.debug(DEBUG_PREFIX + "Added mapping", pair[0], "=>", extension_settings.speech_recognition.messageMapping[pair[0]]);
        }
        else {
            console.debug(DEBUG_PREFIX + "Wrong syntax for message mapping, no '=' found in:", text);
        }
    }

    $("#speech_recognition_message_mapping_status").text("Message mapping updated to: " + JSON.stringify(extension_settings.speech_recognition.messageMapping));
    console.debug(DEBUG_PREFIX + "Updated message mapping", extension_settings.speech_recognition.messageMapping);
    extension_settings.speech_recognition.messageMappingText = $('#speech_recognition_message_mapping').val();
    saveSettingsDebounced();
}
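/*
Example: with the mapping textarea set to
    "hey continue = /continue, system roll = /roll 2d6"
the loop above produces
    { "hey continue": "/continue", "system roll": "/roll 2d6" }
Keys and values are lowercased, so mapped slash commands must work in lower case.
*/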
async function onMessageMappingEnabledClick() {
    extension_settings.speech_recognition.messageMappingEnabled = $('#speech_recognition_message_mapping_enabled').is(':checked');
    saveSettingsDebounced();
}
$(document).ready(function () {
    function addExtensionControls() {
        const settingsHtml = `
        <div id="speech_recognition_settings">
            <div class="inline-drawer">
                <div class="inline-drawer-toggle inline-drawer-header">
                    <b>Speech Recognition</b>
                    <div class="inline-drawer-icon fa-solid fa-circle-chevron-down down"></div>
                </div>
                <div class="inline-drawer-content">
                    <div>
                        <span>Select Speech-to-text Provider</span><br/>
                        <select id="speech_recognition_provider">
                        </select>
                    </div>
                    <div id="speech_recognition_message_mode_div">
                        <span>Message Mode</span><br/>
                        <select id="speech_recognition_message_mode">
                            <option value="append">Append</option>
                            <option value="replace">Replace</option>
                            <option value="auto_send">Auto send</option>
                        </select>
                    </div>
                    <div id="speech_recognition_message_mapping_div">
                        <span>Message Mapping</span>
                        <textarea id="speech_recognition_message_mapping" class="text_pole textarea_compact" type="text" rows="4" placeholder="Enter comma separated phrases mapping, example:\ncommand delete = /del 2,\nslash delete = /del 2,\nsystem roll = /roll 2d6,\nhey continue = /continue"></textarea>
                        <span id="speech_recognition_message_mapping_status"></span>
                        <label class="checkbox_label" for="speech_recognition_message_mapping_enabled">
                            <input type="checkbox" id="speech_recognition_message_mapping_enabled" name="speech_recognition_message_mapping_enabled">
                            <small>Enable messages mapping</small>
                        </label>
                    </div>
                    <form id="speech_recognition_provider_settings" class="inline-drawer-content">
                    </form>
                </div>
            </div>
        </div>
        `;
        $('#extensions_settings').append(settingsHtml);
        $('#speech_recognition_provider_settings').on('input', onSttProviderSettingsInput);

        for (const provider in sttProviders) {
            $('#speech_recognition_provider').append($("<option />").val(provider).text(provider));
            console.debug(DEBUG_PREFIX + "added option " + provider);
        }

        $('#speech_recognition_provider').on('change', onSttProviderChange);
        $('#speech_recognition_message_mode').on('change', onMessageModeChange);
        $('#speech_recognition_message_mapping').on('change', onMessageMappingChange);
        $('#speech_recognition_message_mapping_enabled').on('click', onMessageMappingEnabledClick);

        const $button = $('<div id="microphone_button" class="fa-solid fa-microphone speech-toggle" title="Click to speak"></div>');
        $('#send_but_sheld').prepend($button);
    }

    addExtensionControls(); // No init dependencies
    loadSettings(); // Depends on Extension Controls
    loadSttProvider(extension_settings.speech_recognition.currentProvider); // No dependencies

    const wrapper = new ModuleWorkerWrapper(moduleWorker);
    setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
    moduleWorker();
});