feature: 'kokoro-js' supports TTS #3412 (#3656)

* feature: 'kokoro-js' supports TTS #3412

* Linting, add credits for kokoro library

* Fix voice preview

* Fix display languages on previews

* Fix settings restoration. Debounce model init on settings change

* Fix engine sorting

* Move TTS processing to a web worker. Remove unused gain setting

* Speaking rate fix

* Update status when recreating a worker

* Pass voices list from TTS engine

* Call dispose function on provider change

* Extend worker init timeout to 10 minutes

---------

Co-authored-by: ryan <1014670860@qq.com>
Co-authored-by: Cohee <18619528+Cohee1207@users.noreply.github.com>
This commit is contained in:
felger 2025-03-11 04:54:54 +08:00 committed by GitHub
parent 13a3f4772e
commit e23f3a6314
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 491 additions and 27 deletions

View File

@ -75,6 +75,7 @@ module.exports = {
'plugins/**',
'**/*.min.js',
'public/scripts/extensions/quick-reply/lib/**',
'public/scripts/extensions/tts/lib/**',
],
rules: {
'no-unused-vars': ['error', { args: 'none' }],

1
.github/readme.md vendored
View File

@ -393,6 +393,7 @@ GNU Affero General Public License for more details.**
* Icon theme by Font Awesome <https://fontawesome.com> (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
* Default content by @OtisAlejandro (Seraphina character and lorebook) and @kallmeflocc (10K Discord Users Celebratory Background)
* Docker guide by [@mrguymiah](https://github.com/mrguymiah) and [@Bronya-Rand](https://github.com/Bronya-Rand)
* kokoro-js library by [@hexgrad](https://github.com/hexgrad) (Apache-2.0 License)
## Top Contributors

View File

@ -14,7 +14,8 @@
"**/.git/**",
"lib/**",
"**/*.min.js",
"scripts/extensions/quick-reply/lib/**"
"scripts/extensions/quick-reply/lib/**",
"scripts/extensions/tts/lib/**"
],
"typeAcquisition": {
"include": []

View File

@ -9,6 +9,8 @@ import { QuickReplySettings } from './src/QuickReplySettings.js';
import { SlashCommandHandler } from './src/SlashCommandHandler.js';
import { ButtonUi } from './src/ui/ButtonUi.js';
import { SettingsUi } from './src/ui/SettingsUi.js';
import { debounceAsync } from '../../utils.js';
export { debounceAsync };
@ -17,32 +19,6 @@ const _VERBOSE = true;
export const debug = (...msg) => _VERBOSE ? console.debug('[QR2]', ...msg) : null;
export const log = (...msg) => _VERBOSE ? console.log('[QR2]', ...msg) : null;
export const warn = (...msg) => _VERBOSE ? console.warn('[QR2]', ...msg) : null;
/**
 * Creates a debounced function that delays invoking func until after wait milliseconds have elapsed since the last time the debounced function was invoked.
 *
 * All calls made during one debounce window receive the same promise. It resolves with the
 * return value of the single `func` invocation that ends the window (called with the arguments
 * of the most recent call), or rejects if that invocation throws synchronously.
 * @param {Function} func The function to debounce.
 * @param {Number} [timeout=300] The timeout in milliseconds.
 * @returns {Function} The debounced function. It returns a promise for the result of `func`.
 */
export function debounceAsync(func, timeout = 300) {
    let timer;
    /**@type {Promise}*/
    let debouncePromise;
    /**@type {Function}*/
    let debounceResolver;
    /**@type {Function}*/
    let debounceRejecter;
    return (...args) => {
        // Every new call restarts the waiting period
        clearTimeout(timer);
        if (!debouncePromise) {
            debouncePromise = new Promise((resolve, reject) => {
                debounceResolver = resolve;
                debounceRejecter = reject;
            });
        }
        timer = setTimeout(() => {
            try {
                debounceResolver(func.apply(this, args));
            } catch (error) {
                // A synchronous throw must settle the shared promise, otherwise every waiter hangs
                debounceRejecter(error);
            } finally {
                // The next call starts a fresh debounce window with a new promise
                debouncePromise = null;
            }
        }, timeout);
        return debouncePromise;
    };
}
const defaultConfig = {

View File

@ -27,6 +27,7 @@ import { SlashCommandEnumValue, enumTypes } from '../../slash-commands/SlashComm
import { enumIcons } from '../../slash-commands/SlashCommandCommonEnumsProvider.js';
import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
import { GoogleTranslateTtsProvider } from './google-translate.js';
import { KokoroTtsProvider } from './kokoro.js';
const UPDATE_INTERVAL = 1000;
const wrapper = new ModuleWorkerWrapper(moduleWorker);
@ -94,6 +95,7 @@ const ttsProviders = {
'Google Translate': GoogleTranslateTtsProvider,
GSVI: GSVITtsProvider,
'GPT-SoVITS-V2 (Unofficial)': GptSovitsV2Provider,
Kokoro: KokoroTtsProvider,
Novel: NovelTtsProvider,
OpenAI: OpenAITtsProvider,
'OpenAI Compatible': OpenAICompatibleTtsProvider,
@ -716,6 +718,9 @@ async function loadTtsProvider(provider) {
}
function onTtsProviderChange() {
if (typeof ttsProvider?.dispose === 'function') {
ttsProvider.dispose();
}
const ttsProviderSelection = $('#tts_provider').val();
extension_settings.tts.currentProvider = ttsProviderSelection;
$('#playback_rate_block').toggle(extension_settings.tts.currentProvider !== 'System');

View File

@ -0,0 +1,113 @@
// kokoro-worker.js
// Module-level state shared by the message handler and the helper functions of this worker.

/**
 * Voice names read from the engine instance after it has been created.
 * @type {string[]}
 */
let voices = [];

/**
 * Set to true once the engine has been created successfully.
 * @type {boolean}
 */
let ready = false;

/**
 * The engine instance; stays null until initialization has run.
 * @type {import('./lib/kokoro.web.js').KokoroTTS}
 */
let tts = null;
// Entry point of the worker: every message from the main thread carries an `action`
// name and an optional `data` payload. Each action is answered with exactly one reply
// message; unknown actions are ignored.
self.onmessage = async (e) => {
    const message = e.data;
    const action = message.action;
    const data = message.data;

    if (action === 'initialize') {
        // Reply 'initialized' with the outcome and, on success, the voice list
        try {
            const success = await initializeTts(data);
            const reply = { action: 'initialized', success, voices };
            self.postMessage(reply);
        } catch (error) {
            const failure = { action: 'initialized', success: false, error: error.message };
            self.postMessage(failure);
        }
    } else if (action === 'generateTts') {
        // Reply 'generatedTts'; the audio is handed over as a blob URL and the
        // requestId is echoed back so the caller can match the reply
        try {
            const audioBlob = await generateTts(data.text, data.voice, data.speakingRate);
            const blobUrl = URL.createObjectURL(audioBlob);
            const reply = { action: 'generatedTts', success: true, blobUrl, requestId: data.requestId };
            self.postMessage(reply);
        } catch (error) {
            const failure = {
                action: 'generatedTts',
                success: false,
                error: error.message,
                requestId: data.requestId,
            };
            self.postMessage(failure);
        }
    } else if (action === 'checkReady') {
        // Report the current readiness flag
        self.postMessage({ action: 'readyStatus', ready });
    }
};
/**
 * Loads the bundled kokoro library and creates the TTS engine.
 * On success the module-level `tts`, `voices` and `ready` values are updated.
 * @param {{modelId: string, dtype: string, device: string}} settings Values forwarded to `KokoroTTS.from_pretrained`
 * @returns {Promise<boolean>} Resolves to true when the engine was created
 * @throws Re-throws any error raised while loading or validating the engine (after logging it)
 */
async function initializeTts(settings) {
    try {
        const kokoro = await import('./lib/kokoro.web.js');
        const { modelId, dtype, device } = settings;

        console.log('Worker: Initializing Kokoro TTS with settings:', { modelId, dtype, device });

        tts = await kokoro.KokoroTTS.from_pretrained(modelId, { dtype, device });

        // The voice list is taken from the keys of the instance's `voices` object
        voices = Object.keys(tts.voices);

        // Refuse to report success for an instance that cannot synthesize
        const canGenerate = typeof tts.generate === 'function';
        if (!canGenerate) {
            throw new Error('TTS instance does not have generate method');
        }

        console.log('Worker: TTS initialized successfully');
        ready = true;
        return true;
    } catch (error) {
        console.error('Worker: Kokoro TTS initialization failed:', error);
        ready = false;
        throw error;
    }
}
/**
 * Synthesizes speech for the given text with the initialized engine.
 * @param {string} text Text to speak; must contain non-whitespace characters
 * @param {string} voiceId Voice passed to the engine as `voice`
 * @param {number} speakingRate Passed to the engine as `speed`; a falsy value falls back to 1.0
 * @returns {Promise<*>} Whatever `toBlob()` of the generated audio returns
 * @throws {Error} If the engine is not initialized, the text is blank, or generation fails
 */
async function generateTts(text, voiceId, speakingRate) {
    const engineAvailable = ready && tts;
    if (!engineAvailable) {
        throw new Error('TTS engine not initialized');
    }

    const isBlank = text.trim().length === 0;
    if (isBlank) {
        throw new Error('Empty text');
    }

    const options = {
        voice: voiceId,
        speed: speakingRate || 1.0,
    };

    try {
        const audio = await tts.generate(text, options);
        return audio.toBlob();
    } catch (error) {
        // Log inside the worker, then let the message handler report the failure
        console.error('Worker: TTS generation failed:', error);
        throw error;
    }
}

View File

@ -0,0 +1,326 @@
import { debounce_timeout } from '../../constants.js';
import { debounceAsync } from '../../utils.js';
import { getPreviewString, saveTtsProviderSettings } from './index.js';
/**
 * TTS provider that runs the kokoro-js engine inside a dedicated web worker.
 *
 * The provider exchanges messages with `kokoro-worker.js`: it posts `initialize`,
 * `generateTts` and `checkReady` actions and receives `initialized`, `generatedTts`
 * and `readyStatus` replies. Callers waiting for a reply are tracked in
 * `pendingRequests` and are always settled, including when the worker is restarted
 * or disposed.
 */
export class KokoroTtsProvider {
    constructor() {
        this.settings = {
            modelId: 'onnx-community/Kokoro-82M-v1.0-ONNX',
            dtype: 'q8',
            device: 'wasm',
            voiceMap: {},
            defaultVoice: 'af_heart',
            speakingRate: 1.0,
        };
        this.ready = false;
        // Built-in fallback list; replaced by the list the worker reports once it has initialized
        this.voices = [
            'af_heart',
            'af_alloy',
            'af_aoede',
            'af_bella',
            'af_jessica',
            'af_kore',
            'af_nicole',
            'af_nova',
            'af_river',
            'af_sarah',
            'af_sky',
            'am_adam',
            'am_echo',
            'am_eric',
            'am_fenrir',
            'am_liam',
            'am_michael',
            'am_onyx',
            'am_puck',
            'am_santa',
            'bf_emma',
            'bf_isabella',
            'bm_george',
            'bm_lewis',
            'bf_alice',
            'bf_lily',
            'bm_daniel',
            'bm_fable',
        ];
        /** @type {Worker|null} */
        this.worker = null;
        this.separator = ' ... ... ... ';
        // { resolve, reject } callbacks waiting for a worker reply, keyed by request id
        // (the init handshake uses the key 'initialization')
        this.pendingRequests = new Map();
        this.nextRequestId = 1;
        // Promise of the initialization currently in flight, or null when none is running
        this.initPromise = null;

        // Update display values immediately but only reinitialize TTS after a delay
        this.initTtsDebounced = debounceAsync(this.initializeWorker.bind(this), debounce_timeout.relaxed);
    }

    /**
     * Copies the stored settings over the defaults and wires up the settings controls.
     * @param {object} settings Previously saved provider settings; missing keys keep their defaults
     */
    async loadSettings(settings) {
        if (settings.modelId !== undefined) this.settings.modelId = settings.modelId;
        if (settings.dtype !== undefined) this.settings.dtype = settings.dtype;
        if (settings.device !== undefined) this.settings.device = settings.device;
        if (settings.voiceMap !== undefined) this.settings.voiceMap = settings.voiceMap;
        if (settings.defaultVoice !== undefined) this.settings.defaultVoice = settings.defaultVoice;
        if (settings.speakingRate !== undefined) this.settings.speakingRate = settings.speakingRate;

        const onChange = this.onSettingsChange.bind(this);
        $('#kokoro_model_id').val(this.settings.modelId).on('input', onChange);
        $('#kokoro_dtype').val(this.settings.dtype).on('change', onChange);
        $('#kokoro_device').val(this.settings.device).on('change', onChange);
        $('#kokoro_speaking_rate').val(this.settings.speakingRate).on('input', onChange);
        $('#kokoro_speaking_rate_output').text(this.settings.speakingRate + 'x');
    }

    /**
     * Rejects and forgets every caller that is still waiting for a worker reply.
     * Used whenever the worker that would have answered is terminated.
     * @param {Error} error Reason handed to each waiting caller
     */
    rejectPendingRequests(error) {
        const waiting = Array.from(this.pendingRequests.values());
        this.pendingRequests.clear();
        for (const request of waiting) {
            request.reject(error);
        }
    }

    /**
     * (Re)creates the worker and initializes the engine with the current settings.
     * An existing worker is terminated first and everything waiting on it is rejected.
     * @returns {Promise<boolean>} Resolves with the success flag reported by the worker;
     * rejects if the worker cannot be created, reports a failure, errors out, is replaced
     * before it answered, or does not answer within 10 minutes
     */
    initializeWorker() {
        const initTimeoutMs = 600000; // 600 second timeout
        let worker = null;

        try {
            // Terminate the existing worker if it exists
            if (this.worker) {
                this.worker.terminate();
                this.worker = null;
            }
            $('#kokoro_status_text').text('Initializing...').removeAttr('style');

            // The old worker can no longer answer; settle its callers (this also clears a stale init timer)
            this.ready = false;
            this.rejectPendingRequests(new Error('TTS worker was restarted'));

            // Create a new worker
            worker = new Worker(new URL('./kokoro-worker.js', import.meta.url), { type: 'module' });
            this.worker = worker;

            // Set up message handling
            worker.onmessage = this.handleWorkerMessage.bind(this);
            worker.onerror = (event) => {
                console.error('Kokoro worker error:', event);
                if (this.worker !== worker) {
                    return;
                }
                // Fail a pending handshake right away instead of waiting for the timeout
                const initRequest = this.pendingRequests.get('initialization');
                if (initRequest) {
                    this.pendingRequests.delete('initialization');
                    initRequest.reject(new Error((event && event.message) || 'Worker error'));
                }
            };

            // Initialize the worker with the current settings
            worker.postMessage({
                action: 'initialize',
                data: {
                    modelId: this.settings.modelId,
                    dtype: this.settings.dtype,
                    device: this.settings.device,
                },
            });
        } catch (error) {
            console.error('Failed to create worker:', error);
            if (worker) {
                worker.terminate();
            }
            this.worker = null;
            this.ready = false;
            this.initPromise = null;
            this.updateStatusDisplay();
            return Promise.reject(error);
        }

        const initialization = new Promise((resolve, reject) => {
            const request = {
                resolve: (result) => {
                    clearTimeout(timeoutId);
                    resolve(result);
                },
                reject: (error) => {
                    clearTimeout(timeoutId);
                    reject(error);
                },
            };
            const timeoutId = setTimeout(() => {
                if (this.pendingRequests.get('initialization') === request) {
                    this.pendingRequests.delete('initialization');
                }
                reject(new Error('Worker initialization timed out'));
            }, initTimeoutMs);
            this.pendingRequests.set('initialization', request);
        }).then(
            (success) => {
                // Only the current worker may change the provider state
                if (this.worker === worker) {
                    this.ready = success;
                    this.initPromise = null;
                    this.updateStatusDisplay();
                }
                return success;
            },
            (error) => {
                console.error('Worker initialization failed:', error);
                if (this.worker === worker) {
                    this.ready = false;
                    this.initPromise = null;
                    this.updateStatusDisplay();
                }
                throw error;
            },
        );

        this.initPromise = initialization;
        return initialization;
    }

    /**
     * Routes a reply from the worker to the caller that is waiting for it.
     * @param {MessageEvent} event Message posted by the worker
     */
    handleWorkerMessage(event) {
        const { action, success, ready, error, requestId, blobUrl, voices } = event.data;

        switch (action) {
            case 'initialized': {
                // Prefer the voice list of the loaded model over the built-in fallback
                if (success && Array.isArray(voices) && voices.length > 0) {
                    this.voices = voices;
                }
                const initRequest = this.pendingRequests.get('initialization');
                if (initRequest) {
                    this.pendingRequests.delete('initialization');
                    if (success) {
                        initRequest.resolve(true);
                    } else {
                        initRequest.reject(new Error(error || 'Initialization failed'));
                    }
                }
            } break;
            case 'generatedTts': {
                const request = this.pendingRequests.get(requestId);
                if (request) {
                    this.pendingRequests.delete(requestId);
                    if (success) {
                        fetch(blobUrl)
                            .then(response => response.blob())
                            .then(audioBlob => {
                                request.resolve(new Response(audioBlob, {
                                    headers: {
                                        'Content-Type': 'audio/wav',
                                    },
                                }));
                            })
                            .catch(fetchError => {
                                request.reject(new Error('Failed to fetch TTS audio blob: ' + fetchError));
                            })
                            // Clean up the blob URL whether or not the fetch succeeded
                            .finally(() => URL.revokeObjectURL(blobUrl));
                    } else {
                        request.reject(new Error(error || 'TTS generation failed'));
                    }
                }
            } break;
            case 'readyStatus':
                this.ready = ready;
                this.updateStatusDisplay();
                break;
        }
    }

    /** Shows the current readiness in the settings panel. */
    updateStatusDisplay() {
        const statusText = this.ready ? 'Ready' : 'Failed';
        const statusColor = this.ready ? 'green' : 'red';
        $('#kokoro_status_text').text(statusText).css('color', statusColor);
    }

    /**
     * Starts the worker if there is none; otherwise asks the existing worker for its status.
     * @returns {Promise<boolean>} The initialization result, or the last known readiness
     */
    async checkReady() {
        if (!this.worker) {
            return await this.initializeWorker();
        }

        // The answer arrives asynchronously as a 'readyStatus' message
        this.worker.postMessage({ action: 'checkReady' });
        return this.ready;
    }

    /** Restarts the worker on user request. */
    async onRefreshClick() {
        return await this.initializeWorker();
    }

    /** Markup of the provider's settings panel. The template content is kept flush-left so the emitted string is unchanged. */
    get settingsHtml() {
        return `
<div class="kokoro_tts_settings">
<label for="kokoro_model_id">Model ID:</label>
<input id="kokoro_model_id" type="text" class="text_pole" value="${this.settings.modelId}" />
<label for="kokoro_dtype">Data Type:</label>
<select id="kokoro_dtype" class="text_pole">
<option value="q8" ${this.settings.dtype === 'q8' ? 'selected' : ''}>q8 (Recommended)</option>
<option value="fp32" ${this.settings.dtype === 'fp32' ? 'selected' : ''}>fp32 (High Precision)</option>
<option value="fp16" ${this.settings.dtype === 'fp16' ? 'selected' : ''}>fp16</option>
<option value="q4" ${this.settings.dtype === 'q4' ? 'selected' : ''}>q4 (Low Memory)</option>
<option value="q4f16" ${this.settings.dtype === 'q4f16' ? 'selected' : ''}>q4f16</option>
</select>
<label for="kokoro_device">Device:</label>
<select id="kokoro_device" class="text_pole">
<option value="wasm" ${this.settings.device === 'wasm' ? 'selected' : ''}>WebAssembly (CPU)</option>
<option value="webgpu" ${this.settings.device === 'webgpu' ? 'selected' : ''}>WebGPU (GPU Acceleration)</option>
</select>
<label for="kokoro_speaking_rate">Speaking Rate: <span id="kokoro_speaking_rate_output">${this.settings.speakingRate}x</span></label>
<input id="kokoro_speaking_rate" type="range" value="${this.settings.speakingRate}" min="0.5" max="2.0" step="0.1" />
<hr>
<div>
Status: <span id="kokoro_status_text">Initializing...</span>
</div>
</div>
`;
    }

    /**
     * Reads the controls back into the settings, saves them and, when a setting that is
     * passed to the worker's `initialize` action changed, schedules a debounced restart.
     */
    async onSettingsChange() {
        const previousModelId = this.settings.modelId;
        const previousDtype = this.settings.dtype;
        const previousDevice = this.settings.device;

        this.settings.modelId = $('#kokoro_model_id').val().toString();
        this.settings.dtype = $('#kokoro_dtype').val().toString();
        this.settings.device = $('#kokoro_device').val().toString();
        this.settings.speakingRate = parseFloat($('#kokoro_speaking_rate').val().toString());

        // Update UI display
        $('#kokoro_speaking_rate_output').text(this.settings.speakingRate + 'x');

        // The speaking rate is sent with every generate request, so only changes to the
        // values used for initialization require a new worker
        const engineSettingsChanged = previousModelId !== this.settings.modelId
            || previousDtype !== this.settings.dtype
            || previousDevice !== this.settings.device;
        if (engineSettingsChanged) {
            // Reinitialize TTS engine with debounce
            this.initTtsDebounced().catch(error => {
                console.error('Kokoro TTS re-initialization failed:', error);
            });
        }

        saveTtsProviderSettings();
    }

    /**
     * Builds the voice descriptor used by the voice list and by getVoice().
     * Names starting with 'b' are labelled en-GB, all others en-US.
     * @param {string} voiceName Voice name
     * @returns {{name: string, voice_id: string, preview_url: null, lang: string}} Voice descriptor
     */
    toVoiceObject(voiceName) {
        return {
            name: voiceName,
            voice_id: voiceName,
            preview_url: null,
            lang: voiceName.startsWith('b') ? 'en-GB' : 'en-US',
        };
    }

    /**
     * Lists the known voices, starting the engine first if it is not ready.
     * @returns {Promise<object[]>} Voice descriptors
     */
    async fetchTtsVoiceObjects() {
        if (!this.ready) {
            await this.checkReady();
        }

        return this.voices.map(voice => this.toVoiceObject(voice));
    }

    /**
     * Generates and plays a short sample with the given voice.
     * @param {string} voiceId Voice to preview
     */
    async previewTtsVoice(voiceId) {
        if (!this.ready) {
            await this.checkReady();
        }

        const voice = this.getVoice(voiceId);
        const previewText = getPreviewString(voice.lang);
        const response = await this.generateTts(previewText, voiceId);
        const audio = await response.blob();
        const url = URL.createObjectURL(audio);
        const audioElement = new Audio();
        audioElement.src = url;
        audioElement.onended = () => URL.revokeObjectURL(url);

        // play() reports failures through its promise; release the URL in that case too
        const playback = audioElement.play();
        if (playback && typeof playback.catch === 'function') {
            playback.catch(error => {
                URL.revokeObjectURL(url);
                console.error('Failed to play voice preview:', error);
            });
        }
    }

    getVoiceDisplayName(voiceId) {
        return voiceId;
    }

    /**
     * Resolves a voice name to a descriptor, falling back to the default voice when the name is unknown.
     * @param {string} voiceName Requested voice name
     * @returns {{name: string, voice_id: string, preview_url: null, lang: string}} Voice descriptor
     */
    getVoice(voiceName) {
        const defaultVoice = this.settings.defaultVoice || 'af_heart';
        const actualVoiceName = this.voices.includes(voiceName) ? voiceName : defaultVoice;
        return this.toVoiceObject(actualVoiceName);
    }

    /**
     * Requests audio for the given text from the worker.
     * @param {string} text Text to speak
     * @param {string} voiceId Requested voice; unknown names fall back to the default voice
     * @returns {Promise<Response>} Response whose body is the generated audio
     * @throws {Error} If the engine cannot be initialized or the text is blank
     */
    async generateTts(text, voiceId) {
        if (!this.ready || !this.worker) {
            console.log('TTS not ready, initializing...');
            // Join an initialization that is already running instead of restarting the worker
            await (this.initPromise || this.initializeWorker());
        }

        if (!this.ready || !this.worker) {
            throw new Error('Failed to initialize TTS engine');
        }

        if (text.trim().length === 0) {
            throw new Error('Empty text');
        }

        const voice = this.getVoice(voiceId);
        const requestId = this.nextRequestId++;

        return new Promise((resolve, reject) => {
            // Store the promise callbacks
            this.pendingRequests.set(requestId, { resolve, reject });

            // Send the request to the worker
            this.worker.postMessage({
                action: 'generateTts',
                data: {
                    text,
                    voice: voice.voice_id,
                    speakingRate: this.settings.speakingRate || 1.0,
                    requestId,
                },
            });
        });
    }

    /** Releases the worker and settles everything that was still waiting on it. */
    dispose() {
        // Clean up the worker when the provider is disposed
        if (this.worker) {
            this.worker.terminate();
            this.worker = null;
        }
        this.ready = false;
        this.initPromise = null;
        this.rejectPendingRequests(new Error('TTS provider was disposed'));
    }
}

View File

@ -0,0 +1,8 @@
# kokoro-js
* Author: hexgrad
* NPM: <https://www.npmjs.com/package/kokoro-js>
* Version: 1.2.0
* License: Apache-2.0
Last updated: 2025-03-10

File diff suppressed because one or more lines are too long

View File

@ -16,6 +16,7 @@ Exported for use in extension index.js, and added to providers list in index.js
1. previewTtsVoice()
2. separator field
3. processText(text)
4. dispose()
# Requirement Descriptions
### generateTts(text, voiceId)
@ -74,3 +75,7 @@ Defines the string of characters used to introduce separation between between th
### processText(text)
Optional.
A function applied to the input text before passing it to the TTS generator. Can be async.
### dispose()
Optional.
Function to handle cleanup of provider resources when the provider is switched.

View File

@ -436,6 +436,33 @@ export function debounce(func, timeout = debounce_timeout.standard) {
return fn;
}
/**
 * Creates a debounced function that delays invoking func until after wait milliseconds have elapsed since the last time the debounced function was invoked.
 *
 * All calls made during one debounce window receive the same promise. It resolves with the
 * return value of the single `func` invocation that ends the window (called with the arguments
 * of the most recent call), or rejects if that invocation throws synchronously.
 * @param {Function} func The function to debounce.
 * @param {Number} [timeout=debounce_timeout.standard] The timeout in milliseconds.
 * @returns {Function} The debounced function. It returns a promise for the result of `func`.
 */
export function debounceAsync(func, timeout = debounce_timeout.standard) {
    let timer;
    /**@type {Promise}*/
    let debouncePromise;
    /**@type {Function}*/
    let debounceResolver;
    /**@type {Function}*/
    let debounceRejecter;
    return (...args) => {
        // Every new call restarts the waiting period
        clearTimeout(timer);
        if (!debouncePromise) {
            debouncePromise = new Promise((resolve, reject) => {
                debounceResolver = resolve;
                debounceRejecter = reject;
            });
        }
        timer = setTimeout(() => {
            try {
                debounceResolver(func.apply(this, args));
            } catch (error) {
                // A synchronous throw must settle the shared promise, otherwise every waiter hangs
                debounceRejecter(error);
            } finally {
                // The next call starts a fresh debounce window with a new promise
                debouncePromise = null;
            }
        }, timeout);
        return debouncePromise;
    };
}
/**
* Cancels a scheduled debounced function.
* Does nothing if the function is not debounced or not scheduled.