From 8cb0fda321a70adfb4a7f62d65ef470b192f723b Mon Sep 17 00:00:00 2001
From: Cohee <18619528+Cohee1207@users.noreply.github.com>
Date: Sat, 12 Oct 2024 13:35:11 +0300
Subject: [PATCH] TTS: Add Google Translate TTS

---
Review notes (this section is not part of the commit message):
- google-translate.js: loadSettings now works on a copy of defaultSettings
  instead of writing into the defaults themselves.
- google-translate.js: getVoice uses find() with strict equality; thrown
  values are Error objects (messages unchanged) so they carry a stack trace.
- google-translate.js: previewTtsVoice no longer re-checks response.ok
  (fetchTtsGeneration already throws) and releases the object URL of a preview
  that was interrupted before it could fire 'ended'.
- google.js: /generate-voice answers 400 for a missing or non-string
  text/voice instead of failing inside speak() and answering 500.
- Hunk sizes and the diffstat were recomputed. The blob ids on the "index"
  lines of the two edited files predate these edits.
- Leading whitespace and blank context lines were restored from the hunk
  sizes; the whitespace-only difference on the jsonParser import line could
  not be recovered and both sides are shown as they appear in the review copy.

 .../extensions/tts/google-translate.js        | 178 ++++++++++++++++++
 public/scripts/extensions/tts/index.js        |   2 +
 src/endpoints/google.js                       |  34 +++-
 3 files changed, 213 insertions(+), 1 deletion(-)
 create mode 100644 public/scripts/extensions/tts/google-translate.js

diff --git a/public/scripts/extensions/tts/google-translate.js b/public/scripts/extensions/tts/google-translate.js
new file mode 100644
index 000000000..fa90aa4c1
--- /dev/null
+++ b/public/scripts/extensions/tts/google-translate.js
@@ -0,0 +1,178 @@
+import { getRequestHeaders } from '../../../script.js';
+import { splitRecursive } from '../../utils.js';
+import { getPreviewString, saveTtsProviderSettings } from './index.js';
+export { GoogleTranslateTtsProvider };
+
+/**
+ * TTS provider that requests speech from the server's Google Translate endpoints.
+ */
+class GoogleTranslateTtsProvider {
+    settings;
+    voices = [];
+    separator = ' . ';
+    audioElement = document.createElement('audio');
+    /** Object URL of the preview loaded into audioElement, or null when there is none. */
+    previewUrl = null;
+
+    defaultSettings = {
+        region: '',
+        voiceMap: {},
+    };
+
+    get settingsHtml() {
+        return '';
+    }
+
+    onSettingsChange() {
+        // Drop the cached voices so that the next lookup fetches them again
+        this.voices = [];
+        saveTtsProviderSettings();
+    }
+
+    /**
+     * Load provider settings on top of the defaults.
+     * @param {object} settings Saved provider settings
+     */
+    async loadSettings(settings) {
+        // Populate Provider UI given input settings
+        if (Object.keys(settings).length === 0) {
+            console.info('Using default TTS Provider settings');
+        }
+
+        // Work on a copy so that defaultSettings itself is never modified
+        this.settings = structuredClone(this.defaultSettings);
+
+        // Only accept keys defined in defaultSettings
+        for (const key in settings) {
+            if (key in this.settings) {
+                this.settings[key] = settings[key];
+            } else {
+                throw new Error(`Invalid setting passed to TTS Provider: ${key}`);
+            }
+        }
+
+        try {
+            await this.checkReady();
+            console.debug('Google Translate TTS: Settings loaded');
+        } catch {
+            console.debug('Google Translate TTS: Settings loaded, but not ready');
+        }
+    }
+
+    // Perform a simple readiness check by trying to fetch voiceIds
+    async checkReady() {
+        await this.fetchTtsVoiceObjects();
+    }
+
+    async onRefreshClick() {
+        await this.checkReady();
+    }
+
+    //#################//
+    //  TTS Interfaces //
+    //#################//
+
+    /**
+     * Find a voice by its name or ID, fetching the voice list on first use.
+     * @param {string} voiceName Voice name or voice ID
+     * @returns {Promise<object>} Matching voice
+     * @throws {Error} When no voice matches
+     */
+    async getVoice(voiceName) {
+        if (this.voices.length === 0) {
+            this.voices = await this.fetchTtsVoiceObjects();
+        }
+        const match = this.voices.find(
+            voice => voice.name === voiceName || voice.voice_id === voiceName,
+        );
+        if (!match) {
+            throw new Error(`TTS Voice name ${voiceName} not found`);
+        }
+        return match;
+    }
+
+    async generateTts(text, voiceId) {
+        return await this.fetchTtsGeneration(text, voiceId);
+    }
+
+    //###########//
+    // API CALLS //
+    //###########//
+
+    /**
+     * Fetch the language list from the server and expose every language as a voice.
+     * @returns {Promise<object[]>} Voices sorted by name
+     */
+    async fetchTtsVoiceObjects() {
+        const response = await fetch('/api/google/list-voices', {
+            method: 'POST',
+            headers: getRequestHeaders(),
+            body: JSON.stringify({}),
+        });
+
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+        }
+
+        // Entries are [code, name] pairs; the code doubles as the voice ID
+        const languages = await response.json();
+        return Object.entries(languages)
+            .sort(([, a], [, b]) => a.localeCompare(b))
+            .map(([code, name]) => ({ name, voice_id: code, preview_url: false, lang: code }));
+    }
+
+    /**
+     * Preview TTS for a given voice ID.
+     * @param {string} id Voice ID
+     */
+    async previewTtsVoice(id) {
+        this.audioElement.pause();
+        this.audioElement.currentTime = 0;
+        const voice = await this.getVoice(id);
+        const text = getPreviewString(voice.lang);
+        // No status check needed: fetchTtsGeneration throws on a non-OK response
+        const response = await this.fetchTtsGeneration(text, id);
+        const audio = await response.blob();
+
+        // A preview that was cut short never fires 'ended', so release its URL here
+        if (this.previewUrl) {
+            URL.revokeObjectURL(this.previewUrl);
+        }
+
+        const url = URL.createObjectURL(audio);
+        this.previewUrl = url;
+        this.audioElement.src = url;
+        this.audioElement.onended = () => {
+            URL.revokeObjectURL(url);
+            if (this.previewUrl === url) {
+                this.previewUrl = null;
+            }
+        };
+        this.audioElement.play();
+    }
+
+    /**
+     * Request speech audio for a text from the server.
+     * @param {string} text Text to speak
+     * @param {string} voiceId Voice ID
+     * @returns {Promise<Response>} Successful response carrying the audio
+     * @throws {Error} When the server responds with a non-OK status
+     */
+    async fetchTtsGeneration(text, voiceId) {
+        const response = await fetch('/api/google/generate-voice', {
+            method: 'POST',
+            headers: getRequestHeaders(),
+            body: JSON.stringify({
+                text: splitRecursive(text, 200),
+                voice: voiceId,
+            }),
+        });
+
+        if (!response.ok) {
+            toastr.error(response.statusText, 'TTS Generation Failed');
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+        }
+
+        return response;
+    }
+}
diff --git a/public/scripts/extensions/tts/index.js b/public/scripts/extensions/tts/index.js
index 4c5314af0..922a16e4f 100644
--- a/public/scripts/extensions/tts/index.js
+++ b/public/scripts/extensions/tts/index.js
@@ -26,6 +26,7 @@ import { debounce_timeout } from '../../constants.js';
 import { SlashCommandEnumValue, enumTypes } from '../../slash-commands/SlashCommandEnumValue.js';
 import { enumIcons } from '../../slash-commands/SlashCommandCommonEnumsProvider.js';
 import { POPUP_TYPE, callGenericPopup } from '../../popup.js';
+import { GoogleTranslateTtsProvider } from './google-translate.js';
 export { talkingAnimation };
 
 const UPDATE_INTERVAL = 1000;
@@ -91,6 +92,7 @@ const ttsProviders = {
     'CosyVoice (Unofficial)': CosyVoiceProvider,
     Edge: EdgeTtsProvider,
     ElevenLabs: ElevenLabsTtsProvider,
+    'Google Translate': GoogleTranslateTtsProvider,
     GSVI: GSVITtsProvider,
     'GPT-SoVITS-V2 (Unofficial)': GptSovitsV2Provider,
     Novel: NovelTtsProvider,
diff --git a/src/endpoints/google.js b/src/endpoints/google.js
index b0ba87260..c9b6c6956 100644
--- a/src/endpoints/google.js
+++ b/src/endpoints/google.js
@@ -1,8 +1,10 @@
+import { Buffer } from 'node:buffer';
 import fetch from 'node-fetch';
 import express from 'express';
+import { speak, languages } from 'google-translate-api-x';
 
 import { readSecret, SECRET_KEYS } from './secrets.js';
-import { jsonParser } from '../express-common.js';
+import { jsonParser } from '../express-common.js';
 import { GEMINI_SAFETY } from '../constants.js';
 
 const API_MAKERSUITE = 'https://generativelanguage.googleapis.com';
@@ -68,3 +70,33 @@ router.post('/caption-image', jsonParser, async (request, response) => {
         response.status(500).send('Internal server error');
     }
 });
+
+router.post('/list-voices', (_, response) => {
+    return response.json(languages);
+});
+
+router.post('/generate-voice', jsonParser, async (request, response) => {
+    try {
+        const text = request.body.text;
+        const voice = request.body.voice ?? 'en';
+
+        // Reject malformed requests up front instead of reporting them as server errors
+        const hasText = Array.isArray(text)
+            ? text.length > 0 && text.every(x => typeof x === 'string')
+            : typeof text === 'string' && text.length > 0;
+        if (!hasText || typeof voice !== 'string') {
+            return response.sendStatus(400);
+        }
+
+        const result = await speak(text, { to: voice, forceBatch: false });
+        const buffer = Array.isArray(result)
+            ? Buffer.concat(result.map(x => new Uint8Array(Buffer.from(x.toString(), 'base64'))))
+            : Buffer.from(result.toString(), 'base64');
+
+        response.setHeader('Content-Type', 'audio/mpeg');
+        return response.send(buffer);
+    } catch (error) {
+        console.error('Google Translate TTS generation failed', error);
+        response.status(500).send('Internal server error');
+    }
+});