diff --git a/.gitignore b/.gitignore
index 9135f6d..2647f93 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,8 @@ thumbs.db
*.png
*.wav
*.mp3
+*.pth
+*.th
*durip*
*rtx*
diff --git a/README.md b/README.md
index 26d4c1b..8df6f58 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,15 @@
# VoiceCraft: Zero-Shot Speech Editing and Text-to-Speech in the Wild
[Demo](https://jasonppy.github.io/VoiceCraft_web) [Paper](https://jasonppy.github.io/assets/pdfs/VoiceCraft.pdf)
+
### TL;DR
VoiceCraft is a token infilling neural codec language model, that achieves state-of-the-art performance on both **speech editing** and **zero-shot text-to-speech (TTS)** on in-the-wild data including audiobooks, internet videos, and podcasts.
To clone or edit an unseen voice, VoiceCraft needs only a few seconds of reference.
+## News
+:star: 03/28/2024: Model weights are up on HuggingFace🤗 [here](https://huggingface.co/pyp1/VoiceCraft/tree/main)!
+
## TODO
The TODOs left will be completed by the end of March 2024.
@@ -13,8 +17,9 @@ The TODOs left will be completed by the end of March 2024.
- [x] Environment setup
- [x] Inference demo for speech editing and TTS
- [x] Training guidance
-- [x] Upload the RealEdit dataset and training manifest
-- [ ] Upload model weights (encodec weights are up)
+- [x] RealEdit dataset and training manifest
+- [x] Model weights (both 330M and 830M; the 330M model seems just as good and is much faster)
+- [ ] More
## Environment setup
diff --git a/demo/temp/mfa_alignments/84_121550_000074_000000.csv b/demo/temp/mfa_alignments/84_121550_000074_000000.csv
index ee0750b..bdf51b5 100644
--- a/demo/temp/mfa_alignments/84_121550_000074_000000.csv
+++ b/demo/temp/mfa_alignments/84_121550_000074_000000.csv
@@ -1,12 +1,12 @@
Begin,End,Label,Type,Speaker
0.03,0.18,but,words,temp
0.18,0.32,when,words,temp
-0.32,0.49,i,words,temp
-0.49,0.64,had,words,temp
+0.32,0.48,i,words,temp
+0.48,0.64,had,words,temp
0.64,1.19,approached,words,temp
1.22,1.58,so,words,temp
-1.58,1.9,near,words,temp
-1.9,2.07,to,words,temp
+1.58,1.91,near,words,temp
+1.91,2.07,to,words,temp
2.07,2.42,them,words,temp
2.53,2.61,the,words,temp
2.61,3.01,common,words,temp
@@ -19,8 +19,8 @@ Begin,End,Label,Type,Speaker
5.54,6.0,not,words,temp
6.0,6.14,by,words,temp
6.14,6.67,distance,words,temp
-6.79,7.06,any,words,temp
-7.06,7.18,of,words,temp
+6.79,7.05,any,words,temp
+7.05,7.18,of,words,temp
7.18,7.34,its,words,temp
7.34,7.87,marks,words,temp
0.03,0.06,B,phones,temp
@@ -29,22 +29,22 @@ Begin,End,Label,Type,Speaker
0.18,0.23,W,phones,temp
0.23,0.27,EH1,phones,temp
0.27,0.32,N,phones,temp
-0.32,0.49,AY1,phones,temp
-0.49,0.5,HH,phones,temp
-0.5,0.6,AE1,phones,temp
+0.32,0.48,AY1,phones,temp
+0.48,0.49,HH,phones,temp
+0.49,0.6,AE1,phones,temp
0.6,0.64,D,phones,temp
0.64,0.7,AH0,phones,temp
0.7,0.83,P,phones,temp
-0.83,0.87,R,phones,temp
-0.87,0.99,OW1,phones,temp
+0.83,0.88,R,phones,temp
+0.88,0.99,OW1,phones,temp
0.99,1.12,CH,phones,temp
1.12,1.19,T,phones,temp
1.22,1.4,S,phones,temp
1.4,1.58,OW1,phones,temp
1.58,1.7,N,phones,temp
1.7,1.84,IH1,phones,temp
-1.84,1.9,R,phones,temp
-1.9,2.01,T,phones,temp
+1.84,1.91,R,phones,temp
+1.91,2.01,T,phones,temp
2.01,2.07,AH0,phones,temp
2.07,2.13,DH,phones,temp
2.13,2.3,EH1,phones,temp
@@ -75,8 +75,8 @@ Begin,End,Label,Type,Speaker
4.34,4.42,D,phones,temp
4.42,4.45,IH0,phones,temp
4.45,4.59,S,phones,temp
-4.59,4.8,IY1,phones,temp
-4.8,4.87,V,phones,temp
+4.59,4.79,IY1,phones,temp
+4.79,4.87,V,phones,temp
4.87,4.97,Z,phones,temp
5.04,5.12,L,phones,temp
5.12,5.33,AO1,phones,temp
@@ -96,14 +96,14 @@ Begin,End,Label,Type,Speaker
6.57,6.67,S,phones,temp
6.79,6.89,EH1,phones,temp
6.89,6.95,N,phones,temp
-6.95,7.06,IY0,phones,temp
-7.06,7.13,AH0,phones,temp
+6.95,7.05,IY0,phones,temp
+7.05,7.13,AH0,phones,temp
7.13,7.18,V,phones,temp
7.18,7.22,IH0,phones,temp
7.22,7.29,T,phones,temp
7.29,7.34,S,phones,temp
7.34,7.39,M,phones,temp
-7.39,7.49,AA1,phones,temp
-7.49,7.58,R,phones,temp
-7.58,7.69,K,phones,temp
-7.69,7.87,S,phones,temp
+7.39,7.5,AA1,phones,temp
+7.5,7.58,R,phones,temp
+7.58,7.7,K,phones,temp
+7.7,7.87,S,phones,temp
diff --git a/inference_speech_editing.ipynb b/inference_speech_editing.ipynb
index 64340f7..67f49cd 100644
--- a/inference_speech_editing.ipynb
+++ b/inference_speech_editing.ipynb
@@ -8,7 +8,7 @@
"source": [
"import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
- "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\""
]
},
{
@@ -47,15 +47,17 @@
"# hyperparameters for inference\n",
"left_margin = 0.08\n",
"right_margin = 0.08\n",
- "seed = 1\n",
"codec_audio_sr = 16000\n",
"codec_sr = 50\n",
"top_k = 0\n",
"top_p = 0.8\n",
"temperature = 1\n",
"kvcache = 0\n",
- "silence_tokens = [1388,1898,131]\n",
- "stop_repetition = -1 # do not stop repetition on silence\n",
+ "# adjust the below three arguments if the generation is not as good\n",
+ "seed = 1 # random seed magic\n",
+ "silence_tokens = [1388,1898,131] # if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n",
+ "stop_repetition = -1 # -1 means do not adjust prob of silence tokens. if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4\n",
+ "# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# point to the original file or record the file\n",
@@ -72,7 +74,7 @@
"# run MFA to get the alignment\n",
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
"os.makedirs(align_temp, exist_ok=True)\n",
- "os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
+ "# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
"# if it fail, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
"# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
@@ -83,15 +85,56 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
- "name": "stderr",
+ "name": "stdout",
"output_type": "stream",
"text": [
- "WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1)\n"
+ "original:\n"
]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "edited:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
@@ -132,8 +175,15 @@
"mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n",
"\n",
"# load model, tokenizer, and other necessary files\n",
- "ckpt_fn = \"/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/pretrained_830M/best_bundle.pth\"\n",
- "encodec_fn = \"/data/scratch/pyp/exp_pyp/audiocraft/encodec/xps/6f79c6a8/checkpoint.th\"\n",
+ "voicecraft_name=\"giga330M.pth\"\n",
+ "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
+ "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
+ "if not os.path.exists(ckpt_fn):\n",
+ " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",
+ " os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",
+ "if not os.path.exists(encodec_fn):\n",
+ " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
+ " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
"ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
"model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
"model.load_state_dict(ckpt[\"model\"])\n",
@@ -155,26 +205,35 @@
"orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()\n",
"# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",
"\n",
- "# output_dir\n",
- "output_dir = \"./demo/generated_se\"\n",
- "os.makedirs(output_dir, exist_ok=True)\n",
+ "# display the audio\n",
+ "from IPython.display import Audio\n",
+ "print(\"original:\")\n",
+ "display(Audio(orig_audio, rate=codec_audio_sr))\n",
"\n",
- "save_fn_new = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav\"\n",
+ "print(\"edited:\")\n",
+ "display(Audio(new_audio, rate=codec_audio_sr))\n",
"\n",
- "torchaudio.save(save_fn_new, new_audio, codec_audio_sr)\n",
+ "# # save the audio\n",
+ "# # output_dir\n",
+ "# output_dir = \"./demo/generated_se\"\n",
+ "# os.makedirs(output_dir, exist_ok=True)\n",
"\n",
- "save_fn_orig = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav\"\n",
- "if not os.path.isfile(save_fn_orig):\n",
- " orig_audio, orig_sr = torchaudio.load(audio_fn)\n",
- " if orig_sr != codec_audio_sr:\n",
- " orig_audio = torchaudio.transforms.Resample(orig_sr, codec_audio_sr)(orig_audio)\n",
- " torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)\n",
+ "# save_fn_new = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav\"\n",
"\n",
- "# if you get error importing T5 in transformers\n",
- "# try \n",
- "# pip uninstall Pillow\n",
- "# pip install Pillow\n",
- "# you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
+ "# torchaudio.save(save_fn_new, new_audio, codec_audio_sr)\n",
+ "\n",
+ "# save_fn_orig = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav\"\n",
+ "# if not os.path.isfile(save_fn_orig):\n",
+ "# orig_audio, orig_sr = torchaudio.load(audio_fn)\n",
+ "# if orig_sr != codec_audio_sr:\n",
+ "# orig_audio = torchaudio.transforms.Resample(orig_sr, codec_audio_sr)(orig_audio)\n",
+ "# torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)\n",
+ "\n",
+ "# # if you get error importing T5 in transformers\n",
+ "# # try \n",
+ "# # pip uninstall Pillow\n",
+ "# # pip install Pillow\n",
+ "# # you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
]
},
{
diff --git a/inference_tts.ipynb b/inference_tts.ipynb
index 75c25a2..89be66c 100644
--- a/inference_tts.ipynb
+++ b/inference_tts.ipynb
@@ -8,23 +8,14 @@
"source": [
"import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
- "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/home/pyp/miniconda3/envs/voicecraft/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# import libs\n",
"import torch\n",
@@ -42,33 +33,223 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Setting up corpus information\u001b[33m...\u001b[0m \n",
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Loading corpus from source files\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/100 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[?25h"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Found \u001b[1;36m1\u001b[0m speaker across \u001b[1;36m1\u001b[0m file, average number of utterances per \n",
+ "\u001b[2;36m \u001b[0m speaker: \u001b[1;36m1.0\u001b[0m \n",
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Initializing multiprocessing jobs\u001b[33m...\u001b[0m \n",
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Normalizing text\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[?25h"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split for feature generation\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/2 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating MFCCs\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating CMVN\u001b[33m...\u001b[0m \n",
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating final features\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[?25h"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split with features\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[?25h"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Compiling training graphs\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing first-pass alignment\u001b[33m...\u001b[0m \n",
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating fMLLR for speaker adaptation\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing second-pass alignment\u001b[33m...\u001b[0m \n",
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Collecting phone and word alignments from alignment lattices\u001b[33m...\u001b[0m \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[?25h"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2;36m \u001b[0m\u001b[33mWARNING \u001b[0m Alignment analysis not available without using postgresql \n",
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Exporting alignment TextGrids to demo/temp/mfa_alignments\u001b[33m...\u001b[0m \n",
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Finished exporting TextGrids to demo/temp/mfa_alignments! \n",
+ "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Done! Everything took \u001b[1;36m40.634\u001b[0m seconds \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
+ "\u001b[?25h"
+ ]
+ }
+ ],
"source": [
"# hyperparameters for inference\n",
- "left_margin = 0.08\n",
- "right_margin = 0.08\n",
- "seed = 1\n",
+ "left_margin = 0.08 # not used for TTS, only for speech editing\n",
+ "right_margin = 0.08 # not used for TTS, only for speech editing\n",
"codec_audio_sr = 16000\n",
"codec_sr = 50\n",
"top_k = 0\n",
"top_p = 0.8\n",
"temperature = 1\n",
- "kvcache = 0\n",
+ "kvcache = 1\n",
"silence_tokens=[1388,1898,131]\n",
- "# if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n",
- "stop_repetition = 2\n",
- "# if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4\n",
+ "# adjust the below three arguments if the generation is not as good\n",
+ "seed = 1 # random seed magic\n",
+ "stop_repetition = 3 # if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n",
+ "sample_batch_size = 4 # if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4\n",
"# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n",
- "sample_batch_size = 1\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# point to the original file or record the file\n",
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
- "orig_audio = \"/home/pyp/VoiceCraft/demo/84_121550_000074_000000.wav\"\n",
+ "orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
"\n",
"# move the audio and transcript to temp folder\n",
- "temp_folder = \"/home/pyp/VoiceCraft/demo/temp\"\n",
+ "temp_folder = \"./demo/temp\"\n",
"os.makedirs(temp_folder, exist_ok=True)\n",
"os.system(f\"cp {orig_audio} {temp_folder}\")\n",
"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
@@ -87,20 +268,61 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
- "name": "stderr",
+ "name": "stdout",
"output_type": "stream",
"text": [
- "Dora directory: /tmp/audiocraft_pyp\n"
+ "concatenate prompt and generated:\n"
]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "generated:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
"# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n",
- "cut_off_sec = 3.01 # according to forced-alignment file, the word \"common\" stop as 3.01 sec\n",
+ "cut_off_sec = 3.01 # NOTE: according to forced-alignment file, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
"info = torchaudio.info(audio_fn)\n",
"audio_dur = info.num_frames / info.sample_rate\n",
@@ -111,8 +333,16 @@
"\n",
"# # load model, tokenizer, and other necessary files\n",
"from models import voicecraft\n",
- "ckpt_fn = \"/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/pretrained_830M/best_bundle.pth\"\n",
- "encodec_fn = \"/data/scratch/pyp/exp_pyp/audiocraft/encodec/xps/6f79c6a8/checkpoint.th\"\n",
+ "voicecraft_name=\"giga830M.pth\"\n",
+ "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
+ "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
+ "if not os.path.exists(ckpt_fn):\n",
+ " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",
+ " os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",
+ "if not os.path.exists(encodec_fn):\n",
+ " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
+ " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
+ "\n",
"ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
"model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
"model.load_state_dict(ckpt[\"model\"])\n",
@@ -133,15 +363,24 @@
"concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",
"# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",
"\n",
- "# output_dir\n",
- "output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",
- "os.makedirs(output_dir, exist_ok=True)\n",
"\n",
- "seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",
- "seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n",
+ "# display the audio\n",
+ "from IPython.display import Audio\n",
+ "print(\"concatenate prompt and generated:\")\n",
+ "display(Audio(concated_audio, rate=codec_audio_sr))\n",
"\n",
- "torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",
- "torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",
+ "print(\"generated:\")\n",
+ "display(Audio(gen_audio, rate=codec_audio_sr))\n",
+ "\n",
+ "# # save the audio\n",
+ "# # output_dir\n",
+ "# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",
+ "# os.makedirs(output_dir, exist_ok=True)\n",
+ "# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",
+ "# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n",
+ "\n",
+ "# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",
+ "# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",
"\n",
"# if you get error importing T5 in transformers\n",
"# try \n",
diff --git a/pretrained_models/.gitkeep b/pretrained_models/.gitkeep
new file mode 100644
index 0000000..e69de29