Mirror of https://github.com/SillyTavern/SillyTavern.git, synced 2025-06-05 21:59:27 +02:00
Add endpoint for transformers.js TTS
@@ -54,6 +54,7 @@ extras:
   embeddingModel: Cohee/jina-embeddings-v2-base-en
   promptExpansionModel: Cohee/fooocus_expansion-onnx
   speechToTextModel: Xenova/whisper-small
+  textToSpeechModel: Xenova/speecht5_tts
 # -- OPENAI CONFIGURATION --
 openai:
   # Will send a random user ID to OpenAI completion API
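The new extras.textToSpeechModel key follows the same pattern as the per-task overrides above it: each task resolves its model name from the config file and falls back to a built-in default when the key is absent. A minimal sketch of that lookup, assuming a helper shaped like the getModelForTask call visible in the getPipeline hunk below (the helper's actual body is not part of this diff):

// Sketch only: resolve a task's model from config, falling back to the
// task's built-in default. The real helper lives in transformers.mjs
// and may differ in detail.
function getModelForTask(task) {
    const defaultModel = tasks[task].defaultModel;
    // e.g. configField 'extras.textToSpeechModel' -> 'Xenova/speecht5_tts'
    const model = getConfigValue(tasks[task].configField, null);
    return model || defaultModel;
}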
@@ -54,17 +54,25 @@ router.post('/recognize', jsonParser, async (req, res) => {
 
 router.post('/synthesize', jsonParser, async (req, res) => {
     try {
+        const wavefile = require('wavefile');
         const TASK = 'text-to-speech';
-        const { model, text, lang } = req.body;
+        const { text, model, speaker } = req.body;
         const module = await import('../transformers.mjs');
         const pipe = await module.default.getPipeline(TASK, model);
+        const speaker_embeddings = speaker
+            ? new Float32Array(new Uint8Array(Buffer.from(speaker.split(',')[1], 'base64')).buffer)
+            : null;
         const start = performance.now();
-        const result = await pipe(text, { language: lang || null });
+        const result = await pipe(text, { speaker_embeddings: speaker_embeddings });
         const end = performance.now();
         console.log(`Execution duration: ${(end - start) / 1000} seconds`);
-        console.log('Synthesized audio:', result.audio);
-        return res.json({ audio: result.audio });
+        const wav = new wavefile.WaveFile();
+        wav.fromScratch(1, result.sampling_rate, '32f', result.audio);
+        const buffer = wav.toBuffer();
+
+        res.set('Content-Type', 'audio/wav');
+        return res.send(Buffer.from(buffer));
     } catch (error) {
         console.error(error);
         return res.sendStatus(500);
     }
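For context, the speaker field is expected to arrive as a data URL: the handler splits on ',' and base64-decodes the remainder into a Float32Array of speaker embeddings. A hedged client-side sketch of calling the endpoint; the /api/speech/synthesize path, the helper name, and the 512-float embedding size (SpeechT5 x-vectors) are assumptions, not part of this diff:

// Sketch only: assumed client helper (Node 18+ fetch); the router's
// actual mount point is not shown in this diff.
async function synthesize(text, speakerEmbedding /* Float32Array or null */) {
    // Pack the raw Float32Array bytes into a base64 data URL; the server
    // decodes everything after the comma back into a Float32Array.
    const speaker = speakerEmbedding
        ? 'data:application/octet-stream;base64,' +
          Buffer.from(speakerEmbedding.buffer).toString('base64')
        : null;

    const response = await fetch('/api/speech/synthesize', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text, model: '', speaker }),
    });

    // The endpoint now returns a mono 32-bit-float WAV instead of JSON.
    return response.blob();
}

Passing an empty model string lets getPipeline fall through to the configured extras.textToSpeechModel (or the Xenova/speecht5_tts default), per the forceModel || getModelForTask(task) line in the last hunk.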
@@ -17,26 +17,37 @@ const tasks = {
         defaultModel: 'Cohee/distilbert-base-uncased-go-emotions-onnx',
         pipeline: null,
         configField: 'extras.classificationModel',
+        quantized: true,
     },
     'image-to-text': {
         defaultModel: 'Xenova/vit-gpt2-image-captioning',
         pipeline: null,
         configField: 'extras.captioningModel',
+        quantized: true,
     },
     'feature-extraction': {
         defaultModel: 'Xenova/all-mpnet-base-v2',
         pipeline: null,
         configField: 'extras.embeddingModel',
+        quantized: true,
     },
     'text-generation': {
         defaultModel: 'Cohee/fooocus_expansion-onnx',
         pipeline: null,
         configField: 'extras.promptExpansionModel',
+        quantized: true,
     },
     'automatic-speech-recognition': {
         defaultModel: 'Xenova/whisper-small',
         pipeline: null,
         configField: 'extras.speechToTextModel',
+        quantized: true,
     },
+    'text-to-speech': {
+        defaultModel: 'Xenova/speecht5_tts',
+        pipeline: null,
+        configField: 'extras.textToSpeechModel',
+        quantized: false,
+    },
 }
 
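The per-task quantized flag exists because the SpeechT5 model is loaded unquantized, while every other task keeps its quantized ONNX weights. Thanks to the ?? fallback in the getPipeline hunk below, any task entry that omits the flag still resolves to quantized loading; a purely illustrative check:

// Illustrative only: how the '??' fallback in getPipeline resolves the flag.
const quantizedFor = (task) => tasks[task].quantized ?? true;

quantizedFor('text-to-speech');               // false (explicit flag above)
quantizedFor('automatic-speech-recognition'); // true  (explicit flag above)
// An entry without the flag would also yield true, since '??' only
// falls through on null or undefined.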
@@ -90,7 +101,7 @@ async function getPipeline(task, forceModel = '') {
     const model = forceModel || getModelForTask(task);
     const localOnly = getConfigValue('extras.disableAutoDownload', false);
     console.log('Initializing transformers.js pipeline for task', task, 'with model', model);
-    const instance = await pipeline(task, model, { cache_dir, quantized: true, local_files_only: localOnly });
+    const instance = await pipeline(task, model, { cache_dir, quantized: tasks[task].quantized ?? true, local_files_only: localOnly });
     tasks[task].pipeline = instance;
     return instance;
 }
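Putting the pieces together, a hedged end-to-end sketch of what the new handler does once the pipeline is up; the 16 kHz sampling rate and the zero-filled placeholder embedding are assumptions based on the upstream SpeechT5 model, not values taken from this diff:

// Sketch only: mirrors the flow of the new /synthesize handler.
const pipe = await getPipeline('text-to-speech'); // lazy-loads Xenova/speecht5_tts, unquantized
const speakerEmbeddings = new Float32Array(512);  // placeholder speaker vector (assumption)
const result = await pipe('Hello world', { speaker_embeddings: speakerEmbeddings });
// result.audio is a Float32Array of samples and result.sampling_rate is
// e.g. 16000; the endpoint wraps them into a mono '32f' WAV via wavefile.

Because getPipeline stores the instance back on tasks[task].pipeline, subsequent requests presumably reuse the loaded model rather than re-initializing it.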