440 lines
18 KiB
JavaScript
440 lines
18 KiB
JavaScript
/*
|
|
TODO:
|
|
- Try pseudo-streaming audio: send a chunk every X seconds and ask VOSK whether the recognized text is complete.
|
|
*/
|
|
|
|
import { saveSettingsDebounced } from "../../../script.js";
|
|
import { getContext, getApiUrl, modules, extension_settings, ModuleWorkerWrapper, doExtrasFetch } from "../../extensions.js";
|
|
import { VoskSttProvider } from './vosk.js'
|
|
import { WhisperSttProvider } from './whisper.js'
|
|
import { BrowserSttProvider } from './browser.js'
|
|
import { StreamingSttProvider } from './streaming.js'
|
|
export { MODULE_NAME };
|
|
|
|
// Name under which this extension is exported.
const MODULE_NAME = 'Speech Recognition';
// Prefix prepended to every log message emitted by this module.
const DEBUG_PREFIX = "<Speech Recognition module> ";
// Delay, in milliseconds, between two runs of the module worker.
const UPDATE_INTERVAL = 100;

// True while the module worker is waiting for the provider's answer.
let inApiCall = false;

// Provider classes selectable in the UI, keyed by the name shown in the
// provider drop-down. "None" has no implementation.
let sttProviders = {
    None: null,
    Browser: BrowserSttProvider,
    Whisper: WhisperSttProvider,
    Vosk: VoskSttProvider,
    Streaming: StreamingSttProvider,
};

// Currently instantiated provider and the name it was selected under.
let sttProvider = null;
let sttProviderName = "None";

// Push-to-talk recording state shared with the microphone button handlers.
let audioRecording = false;
// Constraints passed to getUserMedia when requesting the microphone.
const constraints = {
    audio: {
        sampleSize: 16,
        channelCount: 1,
        sampleRate: 16000,
    },
};
// Chunks collected from the MediaRecorder until recording is stopped.
let audioChunks = [];
|
|
|
|
/**
 * Builds the normalized form of a transcript used for trigger-word matching:
 * lowercased, every character other than ASCII letters, digits, apostrophes
 * and whitespace removed, and whitespace runs collapsed to a single space.
 *
 * @param {string} text - Transcript to normalize.
 * @returns {{raw: string, originalIndexes: number[]}} The normalized text and,
 *   for each character of it, the index in `text` it was produced from.
 */
function normalizeTranscriptWithIndexMap(text) {
    const keptChars = [];
    const originalIndexes = [];

    for (let i = 0; i < text.length; i++) {
        // Lowercasing a single character may yield more than one character.
        for (const ch of text[i].toLowerCase()) {
            if (/\s/.test(ch)) {
                // Collapse whitespace runs, including runs that were only
                // separated by removed punctuation.
                if (keptChars.length === 0 || keptChars[keptChars.length - 1] !== " ") {
                    keptChars.push(" ");
                    originalIndexes.push(i);
                }
            }
            else if (/[A-Za-z0-9']/.test(ch)) {
                keptChars.push(ch);
                originalIndexes.push(i);
            }
        }
    }

    return { raw: keptChars.join(""), originalIndexes };
}

/**
 * Periodic worker for the "Streaming" provider.
 *
 * Asks the provider for the latest user message and, unless trigger words are
 * enabled and none is found, hands the message to processTranscript starting
 * at the earliest trigger word. Does nothing for other providers or while a
 * previous run is still waiting for the provider.
 *
 * @returns {Promise<void>}
 */
async function moduleWorker() {
    if (sttProviderName !== "Streaming") {
        return;
    }

    // API is busy
    if (inApiCall) {
        return;
    }

    try {
        inApiCall = true;
        const userMessageOriginal = await sttProvider.getUserMessage();
        const userMessageFormatted = userMessageOriginal.trim();

        if (userMessageFormatted.length === 0) {
            console.debug(DEBUG_PREFIX+"Received empty transcript, ignored");
            return;
        }

        console.debug(DEBUG_PREFIX+"recorded transcript: \""+userMessageFormatted+"\"");

        // Punctuation-free, lowercased transcript plus a map back to the
        // original text, so that a position found in the normalized text can
        // be used to cut the original one.
        const normalized = normalizeTranscriptWithIndexMap(userMessageFormatted);
        const userMessageRaw = normalized.raw;

        console.debug(DEBUG_PREFIX+"raw transcript:",userMessageRaw);

        // Detect trigger words
        const streamingSettings = extension_settings.speech_recognition.Streaming;
        let messageStart = -1;

        if (streamingSettings.triggerWordsEnabled) {
            let earliestTriggerPos = -1;

            for (const triggerWord of streamingSettings.triggerWords) {
                const triggerPos = userMessageRaw.indexOf(triggerWord.toLowerCase());

                if (triggerPos === -1) {
                    console.debug(DEBUG_PREFIX+"trigger word not found: ", triggerWord);
                    continue;
                }

                console.debug(DEBUG_PREFIX+"Found trigger word: ", triggerWord, " at index ", triggerPos);
                if (earliestTriggerPos === -1 || triggerPos < earliestTriggerPos) {
                    earliestTriggerPos = triggerPos;
                }
            }

            if (earliestTriggerPos !== -1) {
                // Translate the position from the normalized text to the
                // original text. The fallback only applies when the normalized
                // text is empty.
                messageStart = earliestTriggerPos < normalized.originalIndexes.length
                    ? normalized.originalIndexes[earliestTriggerPos]
                    : 0;
            }
        } else {
            messageStart = 0;
        }

        if (messageStart === -1) {
            console.debug(DEBUG_PREFIX+"message ignored, no trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"");
            if (streamingSettings.debug) {
                toastr.info(
                    "No trigger word preceding a message. Voice transcript: \""+ userMessageOriginal +"\"",
                    DEBUG_PREFIX+"message ignored.",
                    { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true },
                );
            }
            return;
        }

        // Not awaited: the busy flag is released as soon as the transcript is
        // handed over, as before.
        processTranscript(userMessageFormatted.substring(messageStart));
    }
    catch (error) {
        // Kept at debug level: this runs on every worker tick.
        console.debug(error);
    }
    finally {
        inApiCall = false;
    }
}
|
|
|
|
/**
 * Applies a recognized transcript to the chat according to the configured
 * message mode.
 *
 * When message mapping is enabled and the punctuation-free, lowercased
 * transcript contains a mapped phrase, the mapped message is written to the
 * send textarea (and generation is started in "auto_send" mode). Otherwise
 * the transcript is sent ("auto_send"), replaces the textarea content
 * ("replace") or is appended to it ("append").
 *
 * Errors are logged and never propagated to the caller.
 *
 * @param {string} transcript - Text returned by the speech-to-text provider.
 *   Non-string and blank values are ignored.
 * @returns {Promise<void>}
 */
async function processTranscript(transcript) {
    try {
        // Non-string values are treated like an empty transcript.
        if (typeof transcript !== "string") {
            console.debug(DEBUG_PREFIX+"Empty transcript, do nothing");
            return;
        }

        const transcriptFormatted = transcript.trim();

        if (transcriptFormatted.length === 0) {
            console.debug(DEBUG_PREFIX+"Empty transcript, do nothing");
            return;
        }

        console.debug(DEBUG_PREFIX+"recorded transcript: \""+transcriptFormatted+"\"");
        const settings = extension_settings.speech_recognition;
        const messageMode = settings.messageMode;
        console.debug(DEBUG_PREFIX+"mode: "+messageMode);

        // Lowercase, remove punctuation and collapse whitespace
        const transcriptRaw = transcriptFormatted
            .toLowerCase()
            .replace(/[^\w\s\']|_/g, "")
            .replace(/\s+/g, " ");

        // Check message mapping
        if (settings.messageMappingEnabled) {
            console.debug(DEBUG_PREFIX+"Start searching message mapping into:",transcriptRaw);
            const messageMapping = settings.messageMapping || {};

            for (const key of Object.keys(messageMapping)) {
                console.debug(DEBUG_PREFIX+"message mapping searching: ", key,"=>",messageMapping[key]);
                if (!transcriptRaw.includes(key)) {
                    continue;
                }

                const mappedMessage = messageMapping[key];
                console.debug(DEBUG_PREFIX+"message mapping found: ", key,"=>",mappedMessage);
                $("#send_textarea").val(mappedMessage);

                if (messageMode === "auto_send") await getContext().generate();
                return;
            }
        }

        console.debug(DEBUG_PREFIX+"no message mapping found, processing transcript as normal message");

        switch (messageMode) {
            case "auto_send": {
                $('#send_textarea').val(""); // clear message area to avoid double message

                console.debug(DEBUG_PREFIX+"Sending message");
                const context = getContext();
                const message = {
                    name: context.name1,
                    is_user: true,
                    is_name: true,
                    send_date: Date.now(),
                    mes: transcriptFormatted,
                };
                context.chat.push(message);
                context.addOneMessage(message);

                await context.generate();

                $('#debug_output').text("<SST-module DEBUG>: message sent: \""+ transcriptFormatted +"\"");
                break;
            }

            case "replace":
                console.debug(DEBUG_PREFIX+"Replacing message");
                $('#send_textarea').val(transcriptFormatted);
                break;

            case "append":
                console.debug(DEBUG_PREFIX+"Appending message");
                $('#send_textarea').val($('#send_textarea').val()+" "+transcriptFormatted);
                break;

            default:
                console.debug(DEBUG_PREFIX+"Not supported stt message mode: "+messageMode);
        }
    }
    catch (error) {
        // Logged as an error so that failures are visible by default.
        console.error(DEBUG_PREFIX+"failed to process transcript:", error);
    }
}
|
|
|
|
/**
 * Turns the microphone button into a push-to-talk toggle backed by the
 * browser's MediaRecorder.
 *
 * Requests microphone access with the module-level constraints. Once
 * granted, the first click starts recording and the next click stops it; the
 * recorded audio is then sent to the current provider's processAudio and the
 * resulting transcript is passed to processTranscript.
 *
 * If getUserMedia is unavailable, an error toast is shown and nothing is
 * bound to the button.
 */
function loadNavigatorAudioRecording() {
    // navigator.mediaDevices itself can be undefined, so it must be checked
    // before reading getUserMedia from it.
    const mediaDevices = typeof navigator !== "undefined" ? navigator.mediaDevices : undefined;

    if (!mediaDevices || !mediaDevices.getUserMedia) {
        console.debug(DEBUG_PREFIX+"getUserMedia not supported on your browser!");
        toastr.error("getUserMedia not supported", DEBUG_PREFIX+"not supported for your browser.", { timeOut: 10000, extendedTimeOut: 20000, preventDuplicates: true });
        return;
    }

    console.debug(DEBUG_PREFIX+' getUserMedia supported by browser.');

    const onSuccess = function(stream) {
        const mediaRecorder = new MediaRecorder(stream);

        // Replace any previously bound handler so only one recorder reacts.
        $("#microphone_button").off('click').on("click", function() {
            if (!audioRecording) {
                mediaRecorder.start();
                console.debug(mediaRecorder.state);
                console.debug("recorder started");
                audioRecording = true;
            }
            else {
                mediaRecorder.stop();
                console.debug(mediaRecorder.state);
                console.debug("recorder stopped");
                audioRecording = false;
            }
            $("#microphone_button").toggleClass('fa-microphone fa-microphone-slash');
        });

        mediaRecorder.onstop = async function() {
            console.debug(DEBUG_PREFIX+"data available after MediaRecorder.stop() called: ", audioChunks.length, " chunks");
            const audioBlob = new Blob(audioChunks, { type: "audio/wav; codecs=0" });
            // Reset before awaiting so a new recording starts from scratch.
            audioChunks = [];

            let transcript;
            try {
                transcript = await sttProvider.processAudio(audioBlob);
            }
            catch (error) {
                // Without this, a failing provider call would surface as an
                // unhandled promise rejection inside the event handler.
                console.error(DEBUG_PREFIX+"audio transcription failed:", error);
                return;
            }

            // TODO: lock and release recording while processing?
            console.debug(DEBUG_PREFIX+"received transcript:", transcript);
            processTranscript(transcript);
        };

        mediaRecorder.ondataavailable = function(e) {
            audioChunks.push(e.data);
        };
    };

    const onError = function(err) {
        console.debug(DEBUG_PREFIX+"The following error occured: " + err);
    };

    mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
}
|
|
|
|
//##############//
|
|
// STT Provider //
|
|
//##############//
|
|
|
|
/**
 * Activates a speech-to-text provider and rebuilds the provider-specific UI.
 *
 * Stores the selection in the extension settings, instantiates the provider
 * class registered in sttProviders, renders its settings HTML, loads its
 * saved settings and shows or hides the microphone button depending on the
 * provider. "None" hides all speech-recognition controls.
 *
 * @param {string} provider - Key of sttProviders. Names that are not
 *   registered (including undefined) fall back to "None".
 */
function loadSttProvider(provider) {
    //Clear the current config and add new config
    $("#speech_recognition_provider_settings").html("");

    // An unknown name would otherwise reach `new sttProviders[...]` and throw.
    let providerName = provider;
    if (!Object.prototype.hasOwnProperty.call(sttProviders, providerName)) {
        console.warn(`Unknown provider ${providerName}, falling back to None`);
        providerName = "None";
    }

    // Init provider references
    extension_settings.speech_recognition.currentProvider = providerName;
    sttProviderName = providerName;

    if (!(sttProviderName in extension_settings.speech_recognition)) {
        console.warn(`Provider ${sttProviderName} not in Extension Settings, initiatilizing provider in settings`);
        extension_settings.speech_recognition[sttProviderName] = {};
    }

    $('#speech_recognition_provider').val(sttProviderName);

    if (sttProviderName === "None") {
        $("#microphone_button").hide();
        $("#speech_recognition_message_mode_div").hide();
        $("#speech_recognition_message_mapping_div").hide();
        return;
    }

    $("#speech_recognition_message_mode_div").show();
    $("#speech_recognition_message_mapping_div").show();

    sttProvider = new sttProviders[sttProviderName]();

    // Init provider settings
    $('#speech_recognition_provider_settings').append(sttProvider.settingsHtml);

    // Use microphone button as push to talk
    if (sttProviderName === "Browser") {
        sttProvider.processTranscriptFunction = processTranscript;
        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
        $("#microphone_button").show();
    }

    if (sttProviderName === "Vosk" || sttProviderName === "Whisper") {
        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
        loadNavigatorAudioRecording();
        $("#microphone_button").show();
    }

    // The streaming provider is polled by the module worker, so the
    // microphone button is unbound and hidden.
    if (sttProviderName === "Streaming") {
        sttProvider.loadSettings(extension_settings.speech_recognition[sttProviderName]);
        $("#microphone_button").off('click');
        $("#microphone_button").hide();
    }
}
|
|
|
|
/**
 * Change handler of the provider drop-down: activates the provider currently
 * selected in the UI and schedules a settings save.
 */
function onSttProviderChange() {
    loadSttProvider($('#speech_recognition_provider').val());
    saveSettingsDebounced();
}
|
|
|
|
/**
 * Input handler of the provider settings form: lets the active provider read
 * its settings from the UI, then stores them under the provider's name in the
 * extension settings and schedules a save.
 */
function onSttProviderSettingsInput() {
    sttProvider.onSettingsChange();

    // Persist changes to SillyTavern stt extension settings
    const providerSettings = sttProvider.settings;
    extension_settings.speech_recognition[sttProviderName] = providerSettings;
    saveSettingsDebounced();

    const serializedSettings = JSON.stringify(providerSettings);
    console.info(`Saved settings ${sttProviderName} ${serializedSettings}`);
}
|
|
|
|
//#############################//
|
|
// Extension UI and Settings //
|
|
//#############################//
|
|
|
|
// Settings applied when the extension has no saved value for a key.
const defaultSettings = {
    // Key of the provider to activate on load.
    currentProvider: "None",
    // How a transcript is applied: "append", "replace" or "auto_send".
    messageMode: "append",
    // Raw text of the message-mapping textarea.
    messageMappingText: "",
    // Parsed mapping of spoken phrase => message. An object, matching the
    // shape written by the message-mapping change handler.
    messageMapping: {},
    messageMappingEnabled: false,
};
|
|
|
|
/**
 * Completes the saved extension settings with defaults and reflects them in
 * the settings UI (enabled flag, message mode, message mapping text and
 * message mapping checkbox).
 *
 * Defaults are applied per key, so settings saved with only some of the keys
 * are completed instead of being used as they are.
 */
function loadSettings() {
    if (!extension_settings.speech_recognition) {
        extension_settings.speech_recognition = {};
    }
    const settings = extension_settings.speech_recognition;

    // Only fill the keys that have no saved value.
    for (const key of Object.keys(defaultSettings)) {
        if (settings[key] === undefined) {
            settings[key] = defaultSettings[key];
        }
    }

    $('#speech_recognition_enabled').prop('checked', settings.enabled);
    $('#speech_recognition_message_mode').val(settings.messageMode);

    // Leave the textarea untouched (placeholder visible) when nothing is saved.
    if (settings.messageMappingText.length > 0) {
        $('#speech_recognition_message_mapping').val(settings.messageMappingText);
    }

    $('#speech_recognition_message_mapping_enabled').prop('checked', settings.messageMappingEnabled);
}
|
|
|
|
/**
 * Change handler of the message-mode drop-down: stores the selected mode,
 * shows the "wait response" option only for "auto_send" with a provider
 * other than "Browser", and schedules a settings save.
 *
 * @returns {Promise<void>}
 */
async function onMessageModeChange() {
    const messageMode = $('#speech_recognition_message_mode').val();
    extension_settings.speech_recognition.messageMode = messageMode;

    // Logical AND on strict comparisons, instead of a bitwise `&`.
    const showWaitResponse = sttProviderName !== "Browser" && messageMode === "auto_send";

    if (showWaitResponse) {
        $("#speech_recognition_wait_response_div").show();
    }
    else {
        $("#speech_recognition_wait_response_div").hide();
    }

    saveSettingsDebounced();
}
|
|
|
|
/**
 * Change handler of the message-mapping textarea: parses the comma separated
 * "phrase = message" entries into the messageMapping setting, shows the
 * result in the status line and schedules a settings save.
 *
 * Each entry is split at its first "=", so the message may itself contain
 * "=". The phrase is lowercased because it is searched in a lowercased
 * transcript; the message keeps the case the user typed. Entries without "="
 * or with an empty phrase are skipped.
 *
 * @returns {Promise<void>}
 */
async function onMessageMappingChange() {
    const mappingText = $('#speech_recognition_message_mapping').val();
    const entries = mappingText
        .split(",")
        .map((entry) => entry.trim())
        .filter((entry) => entry !== '');

    const messageMapping = {};

    for (const entry of entries) {
        const separatorPos = entry.indexOf("=");

        if (separatorPos === -1) {
            console.debug(DEBUG_PREFIX+"Wrong syntax for message mapping, no '=' found in:", entry);
            continue;
        }

        const phrase = entry.substring(0, separatorPos).trim().toLowerCase();
        const message = entry.substring(separatorPos + 1).trim();

        // An empty phrase is contained in every transcript and would hijack
        // all messages.
        if (phrase.length === 0) {
            console.debug(DEBUG_PREFIX+"Wrong syntax for message mapping, empty phrase in:", entry);
            continue;
        }

        messageMapping[phrase] = message;
        console.debug(DEBUG_PREFIX+"Added mapping", phrase,"=>", message);
    }

    extension_settings.speech_recognition.messageMapping = messageMapping;

    $("#speech_recognition_message_mapping_status").text("Message mapping updated to: "+JSON.stringify(messageMapping));
    console.debug(DEBUG_PREFIX+"Updated message mapping", messageMapping);
    extension_settings.speech_recognition.messageMappingText = mappingText;
    saveSettingsDebounced();
}
|
|
|
|
/**
 * Click handler of the "Enable messages mapping" checkbox: mirrors the
 * checkbox state into the extension settings and schedules a save.
 *
 * @returns {Promise<void>}
 */
async function onMessageMappingEnabledClick() {
    const isEnabled = $('#speech_recognition_message_mapping_enabled').is(':checked');
    extension_settings.speech_recognition.messageMappingEnabled = isEnabled;
    saveSettingsDebounced();
}
|
|
|
|
// Extension entry point: builds the settings UI and the microphone button,
// restores the saved settings and provider, then starts the periodic worker.
$(document).ready(function () {
    /**
     * Appends the extension's settings drawer to the extensions panel, fills
     * the provider drop-down, binds the change handlers and adds the
     * microphone button next to the send button.
     */
    function addExtensionControls() {
        const settingsHtml = `
        <div id="speech_recognition_settings">
            <div class="inline-drawer">
                <div class="inline-drawer-toggle inline-drawer-header">
                    <b>Speech Recognition</b>
                    <div class="inline-drawer-icon fa-solid fa-circle-chevron-down down"></div>
                </div>
                <div class="inline-drawer-content">
                    <div>
                        <span>Select Speech-to-text Provider</span> <br>
                        <select id="speech_recognition_provider">
                        </select>
                    </div>
                    <div id="speech_recognition_message_mode_div">
                        <span>Message Mode</span> <br>
                        <select id="speech_recognition_message_mode">
                            <option value="append">Append</option>
                            <option value="replace">Replace</option>
                            <option value="auto_send">Auto send</option>
                        </select>
                    </div>
                    <div id="speech_recognition_message_mapping_div">
                        <span>Message Mapping</span>
                        <textarea id="speech_recognition_message_mapping" class="text_pole textarea_compact" type="text" rows="4" placeholder="Enter comma separated phrases mapping, example:\ncommand delete = /del 2,\nslash delete = /del 2,\nsystem roll = /roll 2d6,\nhey continue = /continue"></textarea>
                        <span id="speech_recognition_message_mapping_status"></span>
                        <label class="checkbox_label" for="speech_recognition_message_mapping_enabled">
                            <input type="checkbox" id="speech_recognition_message_mapping_enabled" name="speech_recognition_message_mapping_enabled">
                            <small>Enable messages mapping</small>
                        </label>
                    </div>
                    <form id="speech_recognition_provider_settings" class="inline-drawer-content">
                    </form>
                </div>
            </div>
        </div>
        `;
        $('#extensions_settings').append(settingsHtml);
        $('#speech_recognition_provider_settings').on('input', onSttProviderSettingsInput);

        // One option per registered provider, in registration order.
        for (const provider in sttProviders) {
            $('#speech_recognition_provider').append($("<option />").val(provider).text(provider));
            console.debug(DEBUG_PREFIX+"added option "+provider);
        }

        $('#speech_recognition_provider').on('change', onSttProviderChange);
        $('#speech_recognition_message_mode').on('change', onMessageModeChange);
        $('#speech_recognition_message_mapping').on('change', onMessageMappingChange);
        $('#speech_recognition_message_mapping_enabled').on('click', onMessageMappingEnabledClick);

        const $button = $('<div id="microphone_button" class="fa-solid fa-microphone speech-toggle" title="Click to speak"></div>');
        $('#send_but_sheld').prepend($button);
    }

    addExtensionControls(); // No init dependencies
    loadSettings(); // Depends on the controls created by addExtensionControls
    loadSttProvider(extension_settings.speech_recognition.currentProvider); // Depends on the controls and on the loaded settings

    const wrapper = new ModuleWorkerWrapper(moduleWorker);
    setInterval(wrapper.update.bind(wrapper), UPDATE_INTERVAL); // Init depends on all the things
    // First run right away instead of waiting for the first interval tick.
    moduleWorker();
});
|