From 142772c3df64ebbeb12e852b33330b13f763ac86 Mon Sep 17 00:00:00 2001 From: jason-on-salt-a40 Date: Fri, 5 Apr 2024 16:42:59 -0700 Subject: [PATCH] upload TTS finetuned 330M model --- README.md | 33 ++++++++++++++++++--------------- inference_tts.ipynb | 6 +++--- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 59cc368..8b7f984 100644 --- a/README.md +++ b/README.md @@ -7,21 +7,7 @@ VoiceCraft is a token infilling neural codec language model, that achieves state To clone or edit an unseen voice, VoiceCraft needs only a few seconds of reference. -## News -:star: 03/28/2024: Model weights are up on HuggingFace🤗 [here](https://huggingface.co/pyp1/VoiceCraft/tree/main)! - -## TODO -- [x] Codebase upload -- [x] Environment setup -- [x] Inference demo for speech editing and TTS -- [x] Training guidance -- [x] RealEdit dataset and training manifest -- [x] Model weights (both 330M and 830M, the former seems to be just as good) -- [x] Write colab notebooks for better hands-on experience -- [ ] HuggingFace Spaces demo -- [ ] Better guidance on training/finetuning - -## How to run TTS inference +## How to run inference There are three ways: 1. with Google Colab. see [quickstart colab](#quickstart-colab) @@ -32,6 +18,23 @@ When you are inside the docker image or you have installed all dependencies, Che If you want to do model development such as training/finetuning, I recommend following [envrionment setup](#environment-setup) and [training](#training). +## News +:star: 03/28/2024: Model weights for giga330M and giga830M are up on HuggingFace🤗 [here](https://huggingface.co/pyp1/VoiceCraft/tree/main)! + +:star: 04/05/2024: I finetuned giga330M with the TTS objective on gigaspeech and 1/5 of librilight, the model outperforms giga830M on TTS. Weights are [here](https://huggingface.co/pyp1/VoiceCraft/tree/main). 
Make sure maximal prompt + generation length <= 16 seconds (due to our limited compute, we had to drop utterances longer than 16s in training data) + +## TODO +- [x] Codebase upload +- [x] Environment setup +- [x] Inference demo for speech editing and TTS +- [x] Training guidance +- [x] RealEdit dataset and training manifest +- [x] Model weights (giga330M.pth, giga830M.pth, and gigaHalfLibri330M_TTSEnhanced_max16s.pth) +- [x] Write colab notebooks for better hands-on experience +- [ ] HuggingFace Spaces demo +- [ ] Better guidance on training/finetuning + + ## QuickStart Colab :star: To try out speech editing or TTS Inference with VoiceCraft, the simplest way is using Google Colab. diff --git a/inference_tts.ipynb b/inference_tts.ipynb index 3cce38d..f18270c 100644 --- a/inference_tts.ipynb +++ b/inference_tts.ipynb @@ -63,7 +63,7 @@ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "from models import voicecraft\n", "#import models.voicecraft as voicecraft\n", - "voicecraft_name=\"giga830M.pth\" # or giga330M.pth\n", + "voicecraft_name=\"gigaHalfLibri330M_TTSEnhanced_max16s.pth\" # or giga330M.pth, giga830M.pth\n", "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n", "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", "if not os.path.exists(ckpt_fn):\n", @@ -141,14 +141,14 @@ "codec_audio_sr = 16000\n", "codec_sr = 50\n", "top_k = 0\n", - "top_p = 0.8\n", + "top_p = 0.9 # can also try 0.8, but 0.9 seems to work better\n", "temperature = 1\n", "silence_tokens=[1388,1898,131]\n", "kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model\n", "\n", "# NOTE adjust the below three arguments if the generation is not as good\n", "stop_repetition = 3 # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1\n", - "sample_batch_size = 4 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 5 or higher. 
What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.\n", + "sample_batch_size = 2 # for gigaHalfLibri330M_TTSEnhanced_max16s.pth, 1 or 2 should be fine since the model is trained to do TTS, for the other two models, might need a higher number. NOTE: if there are long silence or unnaturally stretched words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.\n", "seed = 1 # change seed if you are still unhappy with the result\n", "\n", "def seed_everything(seed):\n",