diff --git a/.cog/tmp/build1355928786/cog-0.0.1.dev-py3-none-any.whl b/.cog/tmp/build1355928786/cog-0.0.1.dev-py3-none-any.whl
deleted file mode 100644
index 1f420bc..0000000
Binary files a/.cog/tmp/build1355928786/cog-0.0.1.dev-py3-none-any.whl and /dev/null differ
diff --git a/.cog/tmp/build1355928786/requirements.txt b/.cog/tmp/build1355928786/requirements.txt
deleted file mode 100644
index f0afe9f..0000000
--- a/.cog/tmp/build1355928786/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.0.1
-torchaudio==2.0.2
-xformers==0.0.22
-tensorboard==2.16.2
-phonemizer==3.2.1
-datasets==2.16.0
-torchmetrics==0.11.1
\ No newline at end of file
diff --git a/.cog/tmp/build1650802162/cog-0.0.1.dev-py3-none-any.whl b/.cog/tmp/build1650802162/cog-0.0.1.dev-py3-none-any.whl
deleted file mode 100644
index 1f420bc..0000000
Binary files a/.cog/tmp/build1650802162/cog-0.0.1.dev-py3-none-any.whl and /dev/null differ
diff --git a/.cog/tmp/build1650802162/requirements.txt b/.cog/tmp/build1650802162/requirements.txt
deleted file mode 100644
index f0afe9f..0000000
--- a/.cog/tmp/build1650802162/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.0.1
-torchaudio==2.0.2
-xformers==0.0.22
-tensorboard==2.16.2
-phonemizer==3.2.1
-datasets==2.16.0
-torchmetrics==0.11.1
\ No newline at end of file
diff --git a/.cog/tmp/build2219159647/cog-0.0.1.dev-py3-none-any.whl b/.cog/tmp/build2219159647/cog-0.0.1.dev-py3-none-any.whl
deleted file mode 100644
index 1f420bc..0000000
Binary files a/.cog/tmp/build2219159647/cog-0.0.1.dev-py3-none-any.whl and /dev/null differ
diff --git a/.cog/tmp/build2219159647/requirements.txt b/.cog/tmp/build2219159647/requirements.txt
deleted file mode 100644
index f0afe9f..0000000
--- a/.cog/tmp/build2219159647/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.0.1
-torchaudio==2.0.2
-xformers==0.0.22
-tensorboard==2.16.2
-phonemizer==3.2.1
-datasets==2.16.0
-torchmetrics==0.11.1
\ No newline at end of file
diff --git a/.cog/tmp/build2697867472/cog-0.0.1.dev-py3-none-any.whl b/.cog/tmp/build2697867472/cog-0.0.1.dev-py3-none-any.whl
deleted file mode 100644
index 1f420bc..0000000
Binary files a/.cog/tmp/build2697867472/cog-0.0.1.dev-py3-none-any.whl and /dev/null differ
diff --git a/.cog/tmp/build2697867472/requirements.txt b/.cog/tmp/build2697867472/requirements.txt
deleted file mode 100644
index f0afe9f..0000000
--- a/.cog/tmp/build2697867472/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.0.1
-torchaudio==2.0.2
-xformers==0.0.22
-tensorboard==2.16.2
-phonemizer==3.2.1
-datasets==2.16.0
-torchmetrics==0.11.1
\ No newline at end of file
diff --git a/.cog/tmp/build3457113023/cog-0.0.1.dev-py3-none-any.whl b/.cog/tmp/build3457113023/cog-0.0.1.dev-py3-none-any.whl
deleted file mode 100644
index 1f420bc..0000000
Binary files a/.cog/tmp/build3457113023/cog-0.0.1.dev-py3-none-any.whl and /dev/null differ
diff --git a/.cog/tmp/build3457113023/requirements.txt b/.cog/tmp/build3457113023/requirements.txt
deleted file mode 100644
index f0afe9f..0000000
--- a/.cog/tmp/build3457113023/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.0.1
-torchaudio==2.0.2
-xformers==0.0.22
-tensorboard==2.16.2
-phonemizer==3.2.1
-datasets==2.16.0
-torchmetrics==0.11.1
\ No newline at end of file
diff --git a/.cog/tmp/build4228484396/cog-0.0.1.dev-py3-none-any.whl b/.cog/tmp/build4228484396/cog-0.0.1.dev-py3-none-any.whl
deleted file mode 100644
index 1f420bc..0000000
Binary files a/.cog/tmp/build4228484396/cog-0.0.1.dev-py3-none-any.whl and /dev/null differ
diff --git a/.cog/tmp/build4228484396/requirements.txt b/.cog/tmp/build4228484396/requirements.txt
deleted file mode 100644
index f0afe9f..0000000
--- a/.cog/tmp/build4228484396/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
---extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.0.1
-torchaudio==2.0.2
-xformers==0.0.22
-tensorboard==2.16.2
-phonemizer==3.2.1
-datasets==2.16.0
-torchmetrics==0.11.1
\ No newline at end of file
diff --git a/README.md b/README.md
index 160582b..8b75cfb 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # VoiceCraft: Zero-Shot Speech Editing and Text-to-Speech in the Wild
-[![Paper](https://img.shields.io/badge/arXiv-2301.12503-brightgreen.svg?style=flat-square)](https://jasonppy.github.io/assets/pdfs/VoiceCraft.pdf) [![githubio](https://img.shields.io/badge/GitHub.io-Audio_Samples-blue?logo=Github&style=flat-square)](https://jasonppy.github.io/VoiceCraft_web/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1IOjpglQyMTO2C3Y94LD9FY0Ocn-RJRg6?usp=sharing) [![Replicate](https://replicate.com/cjwbw/voicecraft/badge)](https://replicate.com/cjwbw/voicecraft)
+[![Paper](https://img.shields.io/badge/arXiv-2403.16973-brightgreen.svg?style=flat-square)](https://arxiv.org/pdf/2403.16973.pdf) [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1IOjpglQyMTO2C3Y94LD9FY0Ocn-RJRg6?usp=sharing) [![Replicate](https://replicate.com/cjwbw/voicecraft/badge)](https://replicate.com/cjwbw/voicecraft) [![YouTube demo](https://img.shields.io/youtube/comments/eikybOi8iwU)](https://youtu.be/eikybOi8iwU) [![Demo page](https://img.shields.io/badge/GitHub.io-Audio_Samples-blue?logo=Github&style=flat-square)](https://jasonppy.github.io/VoiceCraft_web/)
 
 ### TL;DR
@@ -19,6 +19,8 @@ When you are inside the docker image or you have installed all dependencies, Che
 If you want to do model development such as training/finetuning, I recommend following [envrionment setup](#environment-setup) and [training](#training).
 
 ## News
+:star: 04/22/2024: 330M/830M TTS Enhanced Models are up [here](https://huggingface.co/pyp1), load them through [`gradio_app.py`](./gradio_app.py) or [`inference_tts.ipynb`](./inference_tts.ipynb)! Replicate demo is up, major thanks to [@chenxwh](https://github.com/chenxwh)!
+
 :star: 04/11/2024: VoiceCraft Gradio is now available on HuggingFace Spaces [here](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio)! Major thanks to [@zuev-stepan](https://github.com/zuev-stepan), [@Sewlell](https://github.com/Sewlell), [@pgsoar](https://github.com/pgosar) [@Ph0rk0z](https://github.com/Ph0rk0z).
 :star: 04/05/2024: I finetuned giga330M with the TTS objective on gigaspeech and 1/5 of librilight. Weights are [here](https://huggingface.co/pyp1/VoiceCraft/tree/main). Make sure maximal prompt + generation length <= 16 seconds (due to our limited compute, we had to drop utterances longer than 16s in training data). Even stronger models forthcomming, stay tuned!
@@ -31,7 +33,7 @@ If you want to do model development such as training/finetuning, I recommend fol
 - [x] Inference demo for speech editing and TTS
 - [x] Training guidance
 - [x] RealEdit dataset and training manifest
-- [x] Model weights (giga330M.pth, giga830M.pth, and gigaHalfLibri330M_TTSEnhanced_max16s.pth)
+- [x] Model weights
 - [x] Better guidance on training/finetuning
 - [x] Colab notebooks
 - [x] HuggingFace Spaces demo
@@ -211,7 +213,7 @@ We thank Feiteng for his [VALL-E reproduction](https://github.com/lifeiteng/vall
 ## Citation
 ```
 @article{peng2024voicecraft,
-  author = {Peng, Puyuan and Huang, Po-Yao and Li, Daniel and Mohamed, Abdelrahman and Harwath, David},
+  author = {Peng, Puyuan and Huang, Po-Yao and Mohamed, Abdelrahman and Harwath, David},
   title = {VoiceCraft: Zero-Shot Speech Editing and Text-to-Speech in the Wild},
   journal = {arXiv},
   year = {2024},
diff --git a/gradio_app.py b/gradio_app.py
index fc62c4a..3b4c128 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -85,7 +85,7 @@ def load_models(whisper_backend_name, whisper_model_name, alignment_model_name,
     elif voicecraft_model_name == "830M":
         voicecraft_model_name = "giga830M"
     elif voicecraft_model_name == "330M_TTSEnhanced":
-        voicecraft_model_name = "gigaHalfLibri330M_TTSEnhanced_max16s"
+        voicecraft_model_name = "330M_TTSEnhanced"
     elif voicecraft_model_name == "830M_TTSEnhanced":
         voicecraft_model_name = "830M_TTSEnhanced"
diff --git a/inference_tts.ipynb b/inference_tts.ipynb
index c2a65ad..5b62d21 100644
--- a/inference_tts.ipynb
+++ b/inference_tts.ipynb
@@ -71,7 +71,7 @@
     "# load model, encodec, and phn2num\n",
     "# # load model, tokenizer, and other necessary files\n",
     "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
-    "voicecraft_name=\"830M_TTSEnhanced.pth\" # or giga330M.pth, gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
+    "voicecraft_name=\"830M_TTSEnhanced.pth\" # or giga330M.pth, 330M_TTSEnhanced.pth, giga830M.pth\n",
     "\n",
     "# the new way of loading the model, with huggingface, recommended\n",
     "from models import voicecraft\n",
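
The 04/22 news item and the `inference_tts.ipynb` change above both point at loading the renamed checkpoints (giga330M, giga830M, 330M_TTSEnhanced, 830M_TTSEnhanced) from https://huggingface.co/pyp1. Below is a minimal sketch of that load path; it assumes the `voicecraft.VoiceCraft.from_pretrained` helper and the `pyp1/VoiceCraft_<name>` repo naming, neither of which is shown in this diff.

```python
# Sketch only: assumes the VoiceCraft repo is on PYTHONPATH and that the
# checkpoints are published as Hugging Face repos named "pyp1/VoiceCraft_<name>".
import torch
from models import voicecraft  # model definition shipped with the VoiceCraft repo

voicecraft_name = "330M_TTSEnhanced"  # or "giga330M", "giga830M", "830M_TTSEnhanced"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face-style loading referenced in inference_tts.ipynb
# ("the new way of loading the model"); the exact repo id format is an assumption.
model = voicecraft.VoiceCraft.from_pretrained(f"pyp1/VoiceCraft_{voicecraft_name}")
model.to(device)
model.eval()
```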