diff --git a/README.md b/README.md index ae7c95b..278c7de 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,8 @@ When you are inside the docker image or you have installed all dependencies, Che If you want to do model development such as training/finetuning, I recommend following [envrionment setup](#environment-setup) and [training](#training). ## News +:star: 03/15/2025: changed inference sampling from top-p sampling (top_p=0.9) to top-k sampling (top_k=40), which massively improves editing and TTS performance + :star: 04/22/2024: 330M/830M TTS Enhanced Models are up [here](https://huggingface.co/pyp1), load them through [`gradio_app.py`](./gradio_app.py) or [`inference_tts.ipynb`](./inference_tts.ipynb)! Replicate demo is up, major thanks to [@chenxwh](https://github.com/chenxwh)! :star: 04/11/2024: VoiceCraft Gradio is now available on HuggingFace Spaces [here](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio)! Major thanks to [@zuev-stepan](https://github.com/zuev-stepan), [@Sewlell](https://github.com/Sewlell), [@pgsoar](https://github.com/pgosar) [@Ph0rk0z](https://github.com/Ph0rk0z). 
diff --git a/gradio_app.py b/gradio_app.py index bcf220d..69a7f14 100644 --- a/gradio_app.py +++ b/gradio_app.py @@ -512,9 +512,9 @@ def get_app(): info="set to 0 to use less VRAM, but with slower inference") left_margin = gr.Number(label="left_margin", value=0.08, info="margin to the left of the editing segment") right_margin = gr.Number(label="right_margin", value=0.08, info="margin to the right of the editing segment") - top_p = gr.Number(label="top_p", value=0.9, info="0.9 is a good value, 0.8 is also good") + top_p = gr.Number(label="top_p", value=1, info="set to 1 to disable top-p sampling (top-k sampling is used instead)") temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change") - top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling") + top_k = gr.Number(label="top_k", value=40, info="40 is a good default, can also try 20, 30") codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change') codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change') silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change") diff --git a/inference_speech_editing.ipynb b/inference_speech_editing.ipynb index a0b5cd5..e531711 100644 --- a/inference_speech_editing.ipynb +++ b/inference_speech_editing.ipynb @@ -66,8 +66,8 @@ "right_margin = 0.08\n", "codec_audio_sr = 16000\n", "codec_sr = 50\n", - "top_k = 0\n", - "top_p = 0.8\n", + "top_k = 40\n", + "top_p = 1\n", "temperature = 1\n", "kvcache = 0\n", "# adjust the below three arguments if the generation is not as good\n", diff --git a/inference_tts.ipynb b/inference_tts.ipynb index 5b62d21..156f051 100644 --- a/inference_tts.ipynb +++ b/inference_tts.ipynb @@ -157,8 +157,8 @@ "# hyperparameters for inference\n", "codec_audio_sr = 16000\n", "codec_sr = 50\n", - "top_k = 0\n", - "top_p = 0.9 # can also try 0.8, 
but 0.9 seems to work better\n", +    "top_k = 40 # can also try 20, 30, 50\n", +    "top_p = 1 # 1 means do not do top-p sampling\n", "temperature = 1\n", "silence_tokens=[1388,1898,131]\n", "kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model\n", diff --git a/inference_tts_scale.py b/inference_tts_scale.py index 9915b22..320545e 100644 --- a/inference_tts_scale.py +++ b/inference_tts_scale.py @@ -25,8 +25,8 @@ def get_args(): parser.add_argument("--seed", type=int, default=1) parser.add_argument("--codec_audio_sr", type=int, default=16000, help='the sample rate of audio that the codec is trained for') parser.add_argument("--codec_sr", type=int, default=50, help='the sample rate of the codec codes') - parser.add_argument("--top_k", type=int, default=0, help="sampling param") - parser.add_argument("--top_p", type=float, default=0.8, help="sampling param") + parser.add_argument("--top_k", type=int, default=40, help="sampling param") + parser.add_argument("--top_p", type=float, default=1, help="sampling param") parser.add_argument("--temperature", type=float, default=1.0, help="sampling param") parser.add_argument("--output_dir", type=str, default=None) parser.add_argument("--device", type=str, default="cuda") diff --git a/predict.py b/predict.py index 951be42..1791d85 100644 --- a/predict.py +++ b/predict.py @@ -184,7 +184,7 @@ class Predictor(BasePredictor): ), top_p: float = Input( - description="Default value for TTS is 0.9, and 0.8 for speech editing", - default=0.9, + description="Default is 1, which disables top-p sampling (top-k sampling is used instead)", + default=1, ), stop_repetition: int = Input( default=3, @@ -234,7 +234,7 @@ class Predictor(BasePredictor): # hyperparameters for inference codec_audio_sr = 16000 codec_sr = 50 - top_k = 0 + top_k = 40 silence_tokens = [1388, 1898, 131] if voicecraft_model == "giga330M_TTSEnhanced.pth":