From 96f6f9fc7a40028753a60e093fc49e4010a0a458 Mon Sep 17 00:00:00 2001
From: pyp_l40 <peng_puyuan@outlook.com>
Date: Mon, 22 Apr 2024 11:56:39 -0500
Subject: [PATCH] better handle numbers

---
 gradio_app.py           | 13 +++++++++++++
 gradio_requirements.txt |  1 +
 2 files changed, 14 insertions(+)

diff --git a/gradio_app.py b/gradio_app.py
index 1e84732..fc62c4a 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -1,4 +1,6 @@
 import os
+import re
+from num2words import num2words
 import gradio as gr
 import torch
 import torchaudio
@@ -201,6 +203,15 @@ def get_output_audio(audio_tensors, codec_audio_sr):
     buffer.seek(0)
     return buffer.read()
 
+def replace_numbers_with_words(sentence):
+    """Spell out every run of digits in `sentence` via num2words; a run that fails to convert is left as digits."""
+    def replace_with_words(match):
+        try:
+            return num2words(match.group(0)) # Convert the matched digits to words
+        except Exception: # not a bare except, so KeyboardInterrupt/SystemExit still propagate
+            return match.group(0) # best-effort fallback: keep the original digits
+    sentence = re.sub(r'(\d+)', r' \1 ', sentence) # add spaces around digit runs so \b below also matches ones glued to letters
+    return re.sub(r'\b\d+\b', replace_with_words, sentence) # replace each standalone digit run
 
 def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, temperature,
         stop_repetition, sample_batch_size, kvcache, silence_tokens,
@@ -213,6 +224,8 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
         raise gr.Error("Can't use smart transcript: whisper transcript not found")
 
     seed_everything(seed)
+    transcript = replace_numbers_with_words(transcript).replace("  ", " ").replace("  ", " ") # replace numbers with words, so that the phonemizer can do a better job
+
     if mode == "Long TTS":
         if split_text == "Newline":
             sentences = transcript.split('\n')
diff --git a/gradio_requirements.txt b/gradio_requirements.txt
index e9e7635..5b2958a 100644
--- a/gradio_requirements.txt
+++ b/gradio_requirements.txt
@@ -4,3 +4,4 @@ openai-whisper>=20231117
 aeneas>=1.7.3.0
 whisperx>=3.1.1
 huggingface_hub==0.22.2
+num2words==0.5.13