diff --git a/default/config.yaml b/default/config.yaml
index bb4847c3..5925d573 100644
--- a/default/config.yaml
+++ b/default/config.yaml
@@ -54,6 +54,7 @@ extras:
   embeddingModel: Cohee/jina-embeddings-v2-base-en
   promptExpansionModel: Cohee/fooocus_expansion-onnx
   speechToTextModel: Xenova/whisper-small
+  textToSpeechModel: Xenova/speecht5_tts
 # -- OPENAI CONFIGURATION --
 openai:
   # Will send a random user ID to OpenAI completion API
diff --git a/src/endpoints/speech.js b/src/endpoints/speech.js
index de5e758c..713c420a 100644
--- a/src/endpoints/speech.js
+++ b/src/endpoints/speech.js
@@ -54,17 +54,25 @@ router.post('/recognize', jsonParser, async (req, res) => {
 
 router.post('/synthesize', jsonParser, async (req, res) => {
     try {
+        const wavefile = require('wavefile');
         const TASK = 'text-to-speech';
-        const { model, text, lang } = req.body;
+        const { text, model, speaker } = req.body;
         const module = await import('../transformers.mjs');
         const pipe = await module.default.getPipeline(TASK, model);
+        const speaker_embeddings = speaker
+            ? new Float32Array(new Uint8Array(Buffer.from(speaker.split(',')[1], 'base64')).buffer)
+            : null;
 
         const start = performance.now();
-        const result = await pipe(text, { language: lang || null });
+        const result = await pipe(text, { speaker_embeddings: speaker_embeddings });
         const end = performance.now();
         console.log(`Execution duration: ${(end - start) / 1000} seconds`);
-        console.log('Synthesized audio:', result.audio);
-        return res.json({ audio: result.audio });
+        const wav = new wavefile.WaveFile();
+        wav.fromScratch(1, result.sampling_rate, '32f', result.audio);
+        const buffer = wav.toBuffer();
+
+        res.set('Content-Type', 'audio/wav');
+        return res.send(Buffer.from(buffer));
     } catch (error) {
         console.error(error);
         return res.sendStatus(500);
diff --git a/src/transformers.mjs b/src/transformers.mjs
index 3a30edf6..e0465f30 100644
--- a/src/transformers.mjs
+++ b/src/transformers.mjs
@@ -17,26 +17,37 @@ const tasks = {
         defaultModel: 'Cohee/distilbert-base-uncased-go-emotions-onnx',
         pipeline: null,
         configField: 'extras.classificationModel',
+        quantized: true,
     },
     'image-to-text': {
         defaultModel: 'Xenova/vit-gpt2-image-captioning',
         pipeline: null,
         configField: 'extras.captioningModel',
+        quantized: true,
     },
     'feature-extraction': {
         defaultModel: 'Xenova/all-mpnet-base-v2',
         pipeline: null,
         configField: 'extras.embeddingModel',
+        quantized: true,
     },
     'text-generation': {
         defaultModel: 'Cohee/fooocus_expansion-onnx',
         pipeline: null,
         configField: 'extras.promptExpansionModel',
+        quantized: true,
     },
     'automatic-speech-recognition': {
         defaultModel: 'Xenova/whisper-small',
         pipeline: null,
         configField: 'extras.speechToTextModel',
+        quantized: true,
+    },
+    'text-to-speech': {
+        defaultModel: 'Xenova/speecht5_tts',
+        pipeline: null,
+        configField: 'extras.textToSpeechModel',
+        quantized: false,
     },
 }
 
@@ -90,7 +101,7 @@ async function getPipeline(task, forceModel = '') {
     const model = forceModel || getModelForTask(task);
     const localOnly = getConfigValue('extras.disableAutoDownload', false);
     console.log('Initializing transformers.js pipeline for task', task, 'with model', model);
-    const instance = await pipeline(task, model, { cache_dir, quantized: true, local_files_only: localOnly });
+    const instance = await pipeline(task, model, { cache_dir, quantized: tasks[task].quantized ?? true, local_files_only: localOnly });
     tasks[task].pipeline = instance;
     return instance;
 }
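// --- Usage sketch for the new /synthesize route (not part of the diff) ---
// A minimal Node 18+ client illustrating the request/response contract above.
// The /api/speech mount path, the port, and the speaker-embeddings.bin file are
// assumptions for illustration only; the endpoint itself only requires that
// `speaker` be a data URI whose base64 payload decodes to the raw bytes of a
// Float32Array speaker embedding (a 512-value x-vector for SpeechT5), matching
// the Buffer.from(speaker.split(',')[1], 'base64') parsing added in speech.js.
const fs = require('fs');

async function synthesize(text) {
    // Encode the raw float32 embedding as a data URI; the server keeps only the
    // part after the first comma and base64-decodes it into a Float32Array.
    const embedding = fs.readFileSync('speaker-embeddings.bin');
    const speaker = 'data:application/octet-stream;base64,' + embedding.toString('base64');

    const response = await fetch('http://localhost:8000/api/speech/synthesize', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text, model: 'Xenova/speecht5_tts', speaker }),
    });

    // The endpoint now answers with a mono 32-bit float WAV body instead of JSON.
    fs.writeFileSync('output.wav', Buffer.from(await response.arrayBuffer()));
}

synthesize('Hello, this is a test.').catch(console.error);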