From f5fccc0387a07d32a3916b85858facbf69c7b1bb Mon Sep 17 00:00:00 2001
From: Cohee <18619528+Cohee1207@users.noreply.github.com>
Date: Wed, 22 May 2024 01:37:51 +0300
Subject: [PATCH] Add Azure TTS service
---
public/scripts/extensions/tts/azure.js | 207 +++++++++++++++++++++++++
public/scripts/extensions/tts/index.js | 2 +
public/scripts/secrets.js | 1 +
server.js | 3 +
src/endpoints/azure.js | 92 +++++++++++
src/endpoints/secrets.js | 1 +
6 files changed, 306 insertions(+)
create mode 100644 public/scripts/extensions/tts/azure.js
create mode 100644 src/endpoints/azure.js
diff --git a/public/scripts/extensions/tts/azure.js b/public/scripts/extensions/tts/azure.js
new file mode 100644
index 000000000..abbf3ef33
--- /dev/null
+++ b/public/scripts/extensions/tts/azure.js
@@ -0,0 +1,207 @@
+import { callPopup, getRequestHeaders } from '../../../script.js';
+import { SECRET_KEYS, findSecret, secret_state, writeSecret } from '../../secrets.js';
+import { getPreviewString, saveTtsProviderSettings } from './index.js';
+export { AzureTtsProvider };
+
+/**
+ * TTS provider that talks to the server-side `/api/azure/list` and
+ * `/api/azure/generate` endpoints. The API key is kept in the secrets store
+ * (SECRET_KEYS.AZURE_TTS); only the region and the voice map live in settings.
+ */
+class AzureTtsProvider {
+    //########//
+    // Config //
+    //########//
+
+    /** Active provider settings; assigned by loadSettings(). */
+    settings;
+    /** Cached voice objects; cleared whenever the region changes. */
+    voices = [];
+    separator = ' . ';
+    /** Audio element used to play voice previews. */
+    audioElement = document.createElement('audio');
+
+    defaultSettings = {
+        region: '',
+        voiceMap: {},
+    };
+
+    /**
+     * Markup for the provider settings panel.
+     * NOTE(review): the markup was missing from this patch; it is reconstructed
+     * from the element ids that loadSettings() and onSettingsChange() use
+     * (#azure_tts_key, #azure_tts_region). Confirm the CSS class names against
+     * the other TTS providers.
+     * @returns {string} HTML string
+     */
+    get settingsHtml() {
+        const html = `
+        <div class="azure_tts_settings">
+            <div class="flex-container alignItemsBaseline">
+                <h4 class="flex1 margin0">
+                    <a href="https://portal.azure.com/" target="_blank" rel="noopener noreferrer">Azure TTS Key</a>
+                </h4>
+                <div id="azure_tts_key" class="menu_button menu_button_icon">
+                    <i class="fa-solid fa-key"></i>
+                    <span>Click to set</span>
+                </div>
+            </div>
+            <label for="azure_tts_region">Region:</label>
+            <input id="azure_tts_region" type="text" class="text_pole" placeholder="e.g. westus" />
+            <hr>
+        </div>
+        `;
+        return html;
+    }
+
+    /**
+     * Reads the region from the settings panel, drops the cached voices
+     * (they were fetched for the previous region) and persists the settings.
+     */
+    onSettingsChange() {
+        // Update dynamically
+        this.settings.region = String($('#azure_tts_region').val());
+        // Reset voices
+        this.voices = [];
+        saveTtsProviderSettings();
+    }
+
+    /**
+     * Applies saved settings, wires up the settings panel and runs a readiness check.
+     * @param {object} settings Saved provider settings; only keys present in defaultSettings are accepted
+     * @throws {string} If `settings` contains a key that is not in defaultSettings
+     */
+    async loadSettings(settings) {
+        // Populate Provider UI given input settings
+        if (Object.keys(settings).length === 0) {
+            console.info('Using default TTS Provider settings');
+        }
+
+        // Only accept keys defined in defaultSettings.
+        // Work on a copy so the defaults themselves are never mutated.
+        this.settings = { ...this.defaultSettings, voiceMap: { ...this.defaultSettings.voiceMap } };
+
+        for (const key in settings) {
+            if (key in this.settings) {
+                this.settings[key] = settings[key];
+            } else {
+                throw `Invalid setting passed to TTS Provider: ${key}`;
+            }
+        }
+
+        $('#azure_tts_region').val(this.settings.region).on('input', () => this.onSettingsChange());
+        // toggleClass() only sets/unsets when given a real boolean; a non-boolean state would toggle instead.
+        $('#azure_tts_key').toggleClass('success', Boolean(secret_state[SECRET_KEYS.AZURE_TTS]));
+        $('#azure_tts_key').on('click', async () => {
+            const popupText = 'Azure TTS API Key';
+            const savedKey = secret_state[SECRET_KEYS.AZURE_TTS] ? await findSecret(SECRET_KEYS.AZURE_TTS) : '';
+
+            const key = await callPopup(popupText, 'input', savedKey);
+
+            // Popup cancelled or left empty: keep the existing key.
+            if (!key) {
+                return;
+            }
+
+            await writeSecret(SECRET_KEYS.AZURE_TTS, key);
+
+            toastr.success('API Key saved');
+            $('#azure_tts_key').addClass('success');
+            await this.onRefreshClick();
+        });
+
+        try {
+            await this.checkReady();
+            console.debug('Azure: Settings loaded');
+        } catch {
+            console.debug('Azure: Settings loaded, but not ready');
+        }
+    }
+
+    /**
+     * Perform a simple readiness check by trying to fetch the voice list.
+     * The result is cached so getVoice() does not have to fetch it again.
+     * @throws {Error} If the voice list request fails
+     */
+    async checkReady() {
+        if (secret_state[SECRET_KEYS.AZURE_TTS]) {
+            this.voices = await this.fetchTtsVoiceObjects();
+        } else {
+            this.voices = [];
+        }
+    }
+
+    /** Re-runs the readiness check. */
+    async onRefreshClick() {
+        await this.checkReady();
+    }
+
+    //#################//
+    // TTS Interfaces //
+    //#################//
+
+    /**
+     * Looks up a voice object by name, fetching the voice list first if it is not cached.
+     * @param {string} voiceName Voice name to look up
+     * @returns {Promise<object>} Matching voice object ({ name, voice_id, preview_url, lang })
+     * @throws {string} If no voice with that name exists
+     */
+    async getVoice(voiceName) {
+        if (this.voices.length === 0) {
+            this.voices = await this.fetchTtsVoiceObjects();
+        }
+        const match = this.voices.find(voice => voice.name === voiceName);
+        if (!match) {
+            throw `TTS Voice name ${voiceName} not found`;
+        }
+        return match;
+    }
+
+    /**
+     * Generates speech for the given text.
+     * @param {string} text Text to speak
+     * @param {string} voiceId Voice ID
+     * @returns {Promise<Response>} Fetch response whose body is the generated audio
+     */
+    async generateTts(text, voiceId) {
+        const response = await this.fetchTtsGeneration(text, voiceId);
+        return response;
+    }
+
+    //###########//
+    // API CALLS //
+    //###########//
+
+    /**
+     * Fetches the voice list for the configured region, sorted by locale and then by short name.
+     * @returns {Promise<object[]>} Voice objects; empty when the API key or the region is not set
+     * @throws {Error} If the server responds with a non-OK status
+     */
+    async fetchTtsVoiceObjects() {
+        if (!secret_state[SECRET_KEYS.AZURE_TTS]) {
+            console.warn('Azure TTS API Key not set');
+            return [];
+        }
+
+        if (!this.settings.region) {
+            console.warn('Azure TTS region not set');
+            return [];
+        }
+
+        const response = await fetch('/api/azure/list', {
+            method: 'POST',
+            headers: getRequestHeaders(),
+            body: JSON.stringify({
+                region: this.settings.region,
+            }),
+        });
+
+        if (!response.ok) {
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+        }
+        const responseJson = await response.json();
+        return responseJson
+            .sort((a, b) => a.Locale.localeCompare(b.Locale) || a.ShortName.localeCompare(b.ShortName))
+            .map(x => ({ name: x.ShortName, voice_id: x.ShortName, preview_url: false, lang: x.Locale }));
+    }
+
+    /**
+     * Preview TTS for a given voice ID.
+     * @param {string} id Voice ID
+     */
+    async previewTtsVoice(id) {
+        this.audioElement.pause();
+        this.audioElement.currentTime = 0;
+        const voice = await this.getVoice(id);
+        const text = getPreviewString(voice.lang);
+        // fetchTtsGeneration() already throws on a non-OK response.
+        const response = await this.fetchTtsGeneration(text, id);
+
+        const audio = await response.blob();
+        const url = URL.createObjectURL(audio);
+
+        // Revoke the blob URL only once the element is done with it; revoking
+        // right after play() can invalidate the URL before the audio has loaded.
+        const release = () => {
+            this.audioElement.removeEventListener('ended', release);
+            this.audioElement.removeEventListener('error', release);
+            URL.revokeObjectURL(url);
+        };
+        this.audioElement.addEventListener('ended', release);
+        this.audioElement.addEventListener('error', release);
+
+        this.audioElement.src = url;
+        this.audioElement.play().catch((error) => {
+            console.warn('Azure TTS preview playback failed', error);
+            release();
+        });
+    }
+
+    /**
+     * Requests speech audio from the server.
+     * @param {string} text Text to speak
+     * @param {string} voiceId Voice ID
+     * @returns {Promise<Response>} Fetch response whose body is the generated audio
+     * @throws {Error} If the API key or region is not set, or the server responds with a non-OK status
+     */
+    async fetchTtsGeneration(text, voiceId) {
+        if (!secret_state[SECRET_KEYS.AZURE_TTS]) {
+            throw new Error('Azure TTS API Key not set');
+        }
+
+        if (!this.settings.region) {
+            throw new Error('Azure TTS region not set');
+        }
+
+        const response = await fetch('/api/azure/generate', {
+            method: 'POST',
+            headers: getRequestHeaders(),
+            body: JSON.stringify({
+                text: text,
+                voice: voiceId,
+                region: this.settings.region,
+            }),
+        });
+
+        if (!response.ok) {
+            toastr.error(response.statusText, 'TTS Generation Failed');
+            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
+        }
+
+        return response;
+    }
+}
diff --git a/public/scripts/extensions/tts/index.js b/public/scripts/extensions/tts/index.js
index f73e9d48f..1ac1edd8b 100644
--- a/public/scripts/extensions/tts/index.js
+++ b/public/scripts/extensions/tts/index.js
@@ -13,6 +13,7 @@ import { XTTSTtsProvider } from './xtts.js';
import { GSVITtsProvider } from './gsvi.js';
import { AllTalkTtsProvider } from './alltalk.js';
import { SpeechT5TtsProvider } from './speecht5.js';
+import { AzureTtsProvider } from './azure.js';
import { SlashCommandParser } from '../../slash-commands/SlashCommandParser.js';
import { SlashCommand } from '../../slash-commands/SlashCommand.js';
import { ARGUMENT_TYPE, SlashCommandArgument, SlashCommandNamedArgument } from '../../slash-commands/SlashCommandArgument.js';
@@ -83,6 +84,7 @@ const ttsProviders = {
OpenAI: OpenAITtsProvider,
AllTalk: AllTalkTtsProvider,
SpeechT5: SpeechT5TtsProvider,
+ Azure: AzureTtsProvider,
};
let ttsProvider;
let ttsProviderName;
diff --git a/public/scripts/secrets.js b/public/scripts/secrets.js
index 4d7a6ebf1..cb4477a78 100644
--- a/public/scripts/secrets.js
+++ b/public/scripts/secrets.js
@@ -27,6 +27,7 @@ export const SECRET_KEYS = {
COHERE: 'api_key_cohere',
PERPLEXITY: 'api_key_perplexity',
GROQ: 'api_key_groq',
+ AZURE_TTS: 'api_key_azure_tts',
};
const INPUT_MAP = {
diff --git a/server.js b/server.js
index e658d7b5e..6f67dc287 100644
--- a/server.js
+++ b/server.js
@@ -519,6 +519,9 @@ app.use('/api/backends/scale-alt', require('./src/endpoints/backends/scale-alt')
// Speech (text-to-speech and speech-to-text)
app.use('/api/speech', require('./src/endpoints/speech').router);
+// Azure TTS
+app.use('/api/azure', require('./src/endpoints/azure').router);
+
const tavernUrl = new URL(
(cliArguments.ssl ? 'https://' : 'http://') +
(listen ? '0.0.0.0' : '127.0.0.1') +
diff --git a/src/endpoints/azure.js b/src/endpoints/azure.js
new file mode 100644
index 000000000..4c3b34d5b
--- /dev/null
+++ b/src/endpoints/azure.js
@@ -0,0 +1,92 @@
+const { readSecret, SECRET_KEYS } = require('./secrets');
+const fetch = require('node-fetch').default;
+const express = require('express');
+const { jsonParser } = require('../express-common');
+
+const router = express.Router();
+
+/**
+ * POST /list
+ * Body: { region }
+ * Proxies the Azure voice list for the given region using the stored API key.
+ * Responds 403 when no key is stored, 400 when the region is missing or
+ * malformed, 500 when the upstream request fails, otherwise the upstream JSON.
+ */
+router.post('/list', jsonParser, async (req, res) => {
+    try {
+        const key = readSecret(req.user.directories, SECRET_KEYS.AZURE_TTS);
+
+        if (!key) {
+            console.error('Azure TTS API Key not set');
+            return res.sendStatus(403);
+        }
+
+        const region = String(req.body.region || '').trim();
+
+        if (!region) {
+            console.error('Azure TTS region not set');
+            return res.sendStatus(400);
+        }
+
+        // The region becomes part of the hostname that receives the API key, so it
+        // must be a single hostname label (no dots, slashes, '@', ports, ...).
+        if (!/^[a-z0-9-]+$/i.test(region)) {
+            console.error('Azure TTS region is invalid');
+            return res.sendStatus(400);
+        }
+
+        const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/voices/list`;
+
+        const response = await fetch(url, {
+            method: 'GET',
+            headers: {
+                'Ocp-Apim-Subscription-Key': key,
+            },
+        });
+
+        if (!response.ok) {
+            console.error('Azure Request failed', response.status, response.statusText);
+            return res.sendStatus(500);
+        }
+
+        const voices = await response.json();
+        return res.json(voices);
+    } catch (error) {
+        console.error('Azure Request failed', error);
+        return res.sendStatus(500);
+    }
+});
+
+/**
+ * POST /generate
+ * Body: { text, voice, region }
+ * Wraps the text in SSML, sends it to the Azure endpoint for the given region
+ * and returns the resulting audio as audio/ogg.
+ * Responds 403 when no key is stored, 400 when a parameter is missing or the
+ * region is malformed, 500 when the upstream request fails.
+ */
+router.post('/generate', jsonParser, async (req, res) => {
+    try {
+        const key = readSecret(req.user.directories, SECRET_KEYS.AZURE_TTS);
+
+        if (!key) {
+            console.error('Azure TTS API Key not set');
+            return res.sendStatus(403);
+        }
+
+        const { text, voice } = req.body;
+        const region = String(req.body.region || '').trim();
+        if (!text || !voice || !region) {
+            console.error('Missing required parameters');
+            return res.sendStatus(400);
+        }
+
+        // The region becomes part of the hostname that receives the API key, so it
+        // must be a single hostname label (no dots, slashes, '@', ports, ...).
+        if (!/^[a-z0-9-]+$/i.test(region)) {
+            console.error('Azure TTS region is invalid');
+            return res.sendStatus(400);
+        }
+
+        // Escapes a value for use in XML text or attribute content.
+        // '&' must be replaced first so the other entities are not double-escaped.
+        const escapeXml = (value) => String(value)
+            .replace(/&/g, '&amp;')
+            .replace(/</g, '&lt;')
+            .replace(/>/g, '&gt;')
+            .replace(/"/g, '&quot;')
+            .replace(/'/g, '&apos;');
+
+        const url = `https://${region}.tts.speech.microsoft.com/cognitiveservices/v1`;
+        // The locale is the first two dash-separated parts of the voice name.
+        const lang = escapeXml(String(voice).split('-').slice(0, 2).join('-'));
+        const escapedVoice = escapeXml(voice);
+        const escapedText = escapeXml(text);
+        const ssml = `<speak version="1.0" xml:lang="${lang}"><voice xml:lang="${lang}" name="${escapedVoice}">${escapedText}</voice></speak>`;
+
+        const response = await fetch(url, {
+            method: 'POST',
+            headers: {
+                'Ocp-Apim-Subscription-Key': key,
+                'Content-Type': 'application/ssml+xml',
+                'X-Microsoft-OutputFormat': 'ogg-48khz-16bit-mono-opus',
+            },
+            body: ssml,
+        });
+
+        if (!response.ok) {
+            console.error('Azure Request failed', response.status, response.statusText);
+            return res.sendStatus(500);
+        }
+
+        const audio = await response.buffer();
+        res.set('Content-Type', 'audio/ogg');
+        return res.send(audio);
+    } catch (error) {
+        console.error('Azure Request failed', error);
+        return res.sendStatus(500);
+    }
+});
+
+// Expose the router so the server can mount it.
+module.exports = { router };
diff --git a/src/endpoints/secrets.js b/src/endpoints/secrets.js
index 1a1ac3746..9bf2eb765 100644
--- a/src/endpoints/secrets.js
+++ b/src/endpoints/secrets.js
@@ -39,6 +39,7 @@ const SECRET_KEYS = {
COHERE: 'api_key_cohere',
PERPLEXITY: 'api_key_perplexity',
GROQ: 'api_key_groq',
+ AZURE_TTS: 'api_key_azure_tts',
};
// These are the keys that are safe to expose, even if allowKeysExposure is false