diff --git a/.gitignore b/.gitignore index 9135f6d..2647f93 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,8 @@ thumbs.db *.png *.wav *.mp3 +*.pth +*.th *durip* *rtx* diff --git a/README.md b/README.md index 26d4c1b..8df6f58 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,15 @@ # VoiceCraft: Zero-Shot Speech Editing and Text-to-Speech in the Wild [Demo](https://jasonppy.github.io/VoiceCraft_web) [Paper](https://jasonppy.github.io/assets/pdfs/VoiceCraft.pdf) + ### TL;DR VoiceCraft is a token infilling neural codec language model, that achieves state-of-the-art performance on both **speech editing** and **zero-shot text-to-speech (TTS)** on in-the-wild data including audiobooks, internet videos, and podcasts. To clone or edit an unseen voice, VoiceCraft needs only a few seconds of reference. +## News +:star: 03/28/2024: Model weights are up on HuggingFace🤗 [here](https://huggingface.co/pyp1/VoiceCraft/tree/main)! + ## TODO The TODOs left will be completed by the end of March 2024. @@ -13,8 +17,9 @@ The TODOs left will be completed by the end of March 2024. - [x] Environment setup - [x] Inference demo for speech editing and TTS - [x] Training guidance -- [x] Upload the RealEdit dataset and training manifest -- [ ] Upload model weights (encodec weights are up) +- [x] RealEdit dataset and training manifest +- [x] Model weights (both 330M and 830M, the former seems to be just as good but way faster) +- [ ] More ## Environment setup diff --git a/demo/temp/mfa_alignments/84_121550_000074_000000.csv b/demo/temp/mfa_alignments/84_121550_000074_000000.csv index ee0750b..bdf51b5 100644 --- a/demo/temp/mfa_alignments/84_121550_000074_000000.csv +++ b/demo/temp/mfa_alignments/84_121550_000074_000000.csv @@ -1,12 +1,12 @@ Begin,End,Label,Type,Speaker 0.03,0.18,but,words,temp 0.18,0.32,when,words,temp -0.32,0.49,i,words,temp -0.49,0.64,had,words,temp +0.32,0.48,i,words,temp +0.48,0.64,had,words,temp 0.64,1.19,approached,words,temp 1.22,1.58,so,words,temp -1.58,1.9,near,words,temp -1.9,2.07,to,words,temp +1.58,1.91,near,words,temp +1.91,2.07,to,words,temp 2.07,2.42,them,words,temp 2.53,2.61,the,words,temp 2.61,3.01,common,words,temp @@ -19,8 +19,8 @@ Begin,End,Label,Type,Speaker 5.54,6.0,not,words,temp 6.0,6.14,by,words,temp 6.14,6.67,distance,words,temp -6.79,7.06,any,words,temp -7.06,7.18,of,words,temp +6.79,7.05,any,words,temp +7.05,7.18,of,words,temp 7.18,7.34,its,words,temp 7.34,7.87,marks,words,temp 0.03,0.06,B,phones,temp @@ -29,22 +29,22 @@ Begin,End,Label,Type,Speaker 0.18,0.23,W,phones,temp 0.23,0.27,EH1,phones,temp 0.27,0.32,N,phones,temp -0.32,0.49,AY1,phones,temp -0.49,0.5,HH,phones,temp -0.5,0.6,AE1,phones,temp +0.32,0.48,AY1,phones,temp +0.48,0.49,HH,phones,temp +0.49,0.6,AE1,phones,temp 0.6,0.64,D,phones,temp 0.64,0.7,AH0,phones,temp 0.7,0.83,P,phones,temp -0.83,0.87,R,phones,temp -0.87,0.99,OW1,phones,temp +0.83,0.88,R,phones,temp +0.88,0.99,OW1,phones,temp 0.99,1.12,CH,phones,temp 1.12,1.19,T,phones,temp 1.22,1.4,S,phones,temp 1.4,1.58,OW1,phones,temp 1.58,1.7,N,phones,temp 1.7,1.84,IH1,phones,temp -1.84,1.9,R,phones,temp -1.9,2.01,T,phones,temp +1.84,1.91,R,phones,temp +1.91,2.01,T,phones,temp 2.01,2.07,AH0,phones,temp 2.07,2.13,DH,phones,temp 2.13,2.3,EH1,phones,temp @@ -75,8 +75,8 @@ Begin,End,Label,Type,Speaker 4.34,4.42,D,phones,temp 4.42,4.45,IH0,phones,temp 4.45,4.59,S,phones,temp -4.59,4.8,IY1,phones,temp -4.8,4.87,V,phones,temp +4.59,4.79,IY1,phones,temp +4.79,4.87,V,phones,temp 4.87,4.97,Z,phones,temp 5.04,5.12,L,phones,temp 5.12,5.33,AO1,phones,temp @@ -96,14 +96,14 
@@ Begin,End,Label,Type,Speaker 6.57,6.67,S,phones,temp 6.79,6.89,EH1,phones,temp 6.89,6.95,N,phones,temp -6.95,7.06,IY0,phones,temp -7.06,7.13,AH0,phones,temp +6.95,7.05,IY0,phones,temp +7.05,7.13,AH0,phones,temp 7.13,7.18,V,phones,temp 7.18,7.22,IH0,phones,temp 7.22,7.29,T,phones,temp 7.29,7.34,S,phones,temp 7.34,7.39,M,phones,temp -7.39,7.49,AA1,phones,temp -7.49,7.58,R,phones,temp -7.58,7.69,K,phones,temp -7.69,7.87,S,phones,temp +7.39,7.5,AA1,phones,temp +7.5,7.58,R,phones,temp +7.58,7.7,K,phones,temp +7.7,7.87,S,phones,temp diff --git a/inference_speech_editing.ipynb b/inference_speech_editing.ipynb index 64340f7..67f49cd 100644 --- a/inference_speech_editing.ipynb +++ b/inference_speech_editing.ipynb @@ -8,7 +8,7 @@ "source": [ "import os\n", "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n", - "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"" + "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\"" ] }, { @@ -47,15 +47,17 @@ "# hyperparameters for inference\n", "left_margin = 0.08\n", "right_margin = 0.08\n", - "seed = 1\n", "codec_audio_sr = 16000\n", "codec_sr = 50\n", "top_k = 0\n", "top_p = 0.8\n", "temperature = 1\n", "kvcache = 0\n", - "silence_tokens = [1388,1898,131]\n", - "stop_repetition = -1 # do not stop repetition on silence\n", + "# adjust the three arguments below if the generation quality is not good\n", + "seed = 1 # random seed magic\n", + "silence_tokens = [1388,1898,131] # codec token ids treated as silence\n", + "stop_repetition = -1 # -1 means do not adjust prob of silence tokens\n", + "# if there are long silences or unnaturally stretched words in the generated audio, reduce stop_repetition to 3, 2 or even 1\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "\n", "# point to the original file or record the file\n", @@ -72,7 +74,7 @@ "# run MFA to get the alignment\n", "align_temp = f\"{temp_folder}/mfa_alignments\"\n", "os.makedirs(align_temp, exist_ok=True)\n", - "os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n", + "# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n", "# if it fail, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n", "# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n", "audio_fn = f\"{temp_folder}/{filename}.wav\"\n", @@ -83,15 +85,56 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1)\n" + "original:\n" ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "edited:\n" ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -132,8 +175,15 @@ "mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n", "\n", "# load model, tokenizer, and other necessary files\n", - "ckpt_fn = 
\"/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/pretrained_830M/best_bundle.pth\"\n", - "encodec_fn = \"/data/scratch/pyp/exp_pyp/audiocraft/encodec/xps/6f79c6a8/checkpoint.th\"\n", + "voicecraft_name=\"giga330M.pth\"\n", + "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n", + "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", + "if not os.path.exists(ckpt_fn):\n", + " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n", + " os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n", + "if not os.path.exists(encodec_fn):\n", + " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n", + " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n", "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n", "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", "model.load_state_dict(ckpt[\"model\"])\n", @@ -155,26 +205,35 @@ "orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()\n", "# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n", "\n", - "# output_dir\n", - "output_dir = \"./demo/generated_se\"\n", - "os.makedirs(output_dir, exist_ok=True)\n", + "# display the audio\n", + "from IPython.display import Audio\n", + "print(\"original:\")\n", + "display(Audio(orig_audio, rate=codec_audio_sr))\n", "\n", - "save_fn_new = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav\"\n", + "print(\"edited:\")\n", + "display(Audio(new_audio, rate=codec_audio_sr))\n", "\n", - "torchaudio.save(save_fn_new, new_audio, codec_audio_sr)\n", + "# # save the audio\n", + "# # output_dir\n", + "# output_dir = \"./demo/generated_se\"\n", + "# os.makedirs(output_dir, exist_ok=True)\n", "\n", - "save_fn_orig = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav\"\n", - "if not os.path.isfile(save_fn_orig):\n", - " orig_audio, orig_sr = torchaudio.load(audio_fn)\n", - " if orig_sr != codec_audio_sr:\n", - " orig_audio = torchaudio.transforms.Resample(orig_sr, codec_audio_sr)(orig_audio)\n", - " torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)\n", + "# save_fn_new = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav\"\n", "\n", - "# if you get error importing T5 in transformers\n", - "# try \n", - "# pip uninstall Pillow\n", - "# pip install Pillow\n", - "# you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored" + "# torchaudio.save(save_fn_new, new_audio, codec_audio_sr)\n", + "\n", + "# save_fn_orig = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav\"\n", + "# if not os.path.isfile(save_fn_orig):\n", + "# orig_audio, orig_sr = torchaudio.load(audio_fn)\n", + "# if orig_sr != codec_audio_sr:\n", + "# orig_audio = torchaudio.transforms.Resample(orig_sr, codec_audio_sr)(orig_audio)\n", + "# torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)\n", + "\n", + "# # if you get error importing T5 in transformers\n", + "# # try \n", + "# # pip uninstall Pillow\n", + "# # pip install Pillow\n", + "# # you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored" ] }, { diff --git a/inference_tts.ipynb b/inference_tts.ipynb index 75c25a2..89be66c 100644 --- a/inference_tts.ipynb +++ b/inference_tts.ipynb @@ -8,23 +8,14 @@ "source": [ "import os\n", 
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n", - "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"" + "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/pyp/miniconda3/envs/voicecraft/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "# import libs\n", "import torch\n", @@ -42,33 +33,223 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Setting up corpus information\u001b[33m...\u001b[0m \n", + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Loading corpus from source files\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/100 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[?25h" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Found \u001b[1;36m1\u001b[0m speaker across \u001b[1;36m1\u001b[0m file, average number of utterances per \n", + "\u001b[2;36m \u001b[0m speaker: \u001b[1;36m1.0\u001b[0m \n", + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Initializing multiprocessing jobs\u001b[33m...\u001b[0m \n", + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Normalizing text\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[?25h" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split for feature generation\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/2 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating MFCCs\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? 
it/s\u001b[0m ]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating CMVN\u001b[33m...\u001b[0m \n", + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating final features\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[?25h" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split with features\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[?25h" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Compiling training graphs\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing first-pass alignment\u001b[33m...\u001b[0m \n", + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating fMLLR for speaker adaptation\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? 
it/s\u001b[0m ]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing second-pass alignment\u001b[33m...\u001b[0m \n", + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Collecting phone and word alignments from alignment lattices\u001b[33m...\u001b[0m \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", + "\u001b[?25h" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2;36m \u001b[0m\u001b[33mWARNING \u001b[0m Alignment analysis not available without using postgresql \n", + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Exporting alignment TextGrids to demo/temp/mfa_alignments\u001b[33m...\u001b[0m \n", + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Finished exporting TextGrids to demo/temp/mfa_alignments! \n", + "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Done! Everything took \u001b[1;36m40.634\u001b[0m seconds \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? 
it/s\u001b[0m ]\n", "\u001b[?25h" ] } ], "source": [ "# hyperparameters for inference\n", - "left_margin = 0.08\n", - "right_margin = 0.08\n", - "seed = 1\n", + "left_margin = 0.08 # not used for TTS, only for speech editing\n", + "right_margin = 0.08 # not used for TTS, only for speech editing\n", "codec_audio_sr = 16000\n", "codec_sr = 50\n", "top_k = 0\n", "top_p = 0.8\n", "temperature = 1\n", - "kvcache = 0\n", + "kvcache = 1\n", "silence_tokens=[1388,1898,131]\n", - "# if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n", - "stop_repetition = 2\n", - "# if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4\n", + "# adjust the three arguments below if the generation quality is not good\n", + "seed = 1 # random seed magic\n", + "stop_repetition = 3 # if there are long silences in the generated audio, reduce stop_repetition to 3, 2 or even 1\n", + "sample_batch_size = 4 # if there are long silences or unnaturally stretched words, increase sample_batch_size to 2, 3 or even 4\n", "# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n", - "sample_batch_size = 1\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "\n", "# point to the original file or record the file\n", "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n", - "orig_audio = \"/home/pyp/VoiceCraft/demo/84_121550_000074_000000.wav\"\n", + "orig_audio = \"./demo/84_121550_000074_000000.wav\"\n", "orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n", "\n", "# move the audio and transcript to temp folder\n", - "temp_folder = \"/home/pyp/VoiceCraft/demo/temp\"\n", + "temp_folder = \"./demo/temp\"\n", "os.makedirs(temp_folder, exist_ok=True)\n", "os.system(f\"cp {orig_audio} {temp_folder}\")\n", "filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n", @@ -87,20 +268,61 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Dora directory: /tmp/audiocraft_pyp\n" + "concatenate prompt and generated:\n" ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generated:\n" ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " ], + "text/plain": [ "" ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ "# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n", - "cut_off_sec = 3.01 # according to forced-alignment file, the word \"common\" stop as 3.01 sec\n", + "cut_off_sec = 3.01 # NOTE: according to the forced-alignment file, the word \"common\" stops at 3.01 sec; this will be different for different audio\n", "target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n", "info = torchaudio.info(audio_fn)\n", "audio_dur = info.num_frames / info.sample_rate\n", @@ -111,8 +333,16 @@ "\n", "assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n", "prompt_end_frame = int(cut_off_sec * info.sample_rate)\n", "\n", "# # load model, tokenizer, and other necessary files\n", "from models import 
voicecraft\n", - "ckpt_fn = \"/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/pretrained_830M/best_bundle.pth\"\n", - "encodec_fn = \"/data/scratch/pyp/exp_pyp/audiocraft/encodec/xps/6f79c6a8/checkpoint.th\"\n", + "voicecraft_name=\"giga830M.pth\"\n", + "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n", + "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", + "if not os.path.exists(ckpt_fn):\n", + " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n", + " os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n", + "if not os.path.exists(encodec_fn):\n", + " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n", + " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n", + "\n", "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n", "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", "model.load_state_dict(ckpt[\"model\"])\n", @@ -133,15 +363,24 @@ "concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n", "# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n", "\n", - "# output_dir\n", - "output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n", - "os.makedirs(output_dir, exist_ok=True)\n", "\n", - "seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n", - "seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n", + "# display the audio\n", + "from IPython.display import Audio\n", + "print(\"concatenate prompt and generated:\")\n", + "display(Audio(concated_audio, rate=codec_audio_sr))\n", "\n", - "torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n", - "torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n", + "print(\"generated:\")\n", + "display(Audio(gen_audio, rate=codec_audio_sr))\n", + "\n", + "# # save the audio\n", + "# # output_dir\n", + "# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n", + "# os.makedirs(output_dir, exist_ok=True)\n", + "# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n", + "# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n", + "\n", + "# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n", + "# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n", "\n", "# if you get error importing T5 in transformers\n", "# try \n", diff --git a/pretrained_models/.gitkeep b/pretrained_models/.gitkeep new file mode 100644 index 0000000..e69de29
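A note on the checkpoint-download cells added to both notebooks: the `wget ...\?download\=true` plus `mv` sequence works, but the same files can be fetched from the `pyp1/VoiceCraft` HuggingFace repo with `huggingface_hub`, which resolves the download URL and caching for you. A minimal sketch of that alternative, assuming `huggingface_hub` is installed (the notebooks themselves keep the wget approach):

```python
# Sketch only: fetch the checkpoints named in the notebooks
# (giga330M.pth / giga830M.pth and encodec_4cb2048_giga.th) from HuggingFace.
from huggingface_hub import hf_hub_download  # assumes: pip install huggingface_hub

voicecraft_name = "giga330M.pth"  # or "giga830M.pth"
ckpt_fn = hf_hub_download(repo_id="pyp1/VoiceCraft", filename=voicecraft_name,
                          local_dir="./pretrained_models")
encodec_fn = hf_hub_download(repo_id="pyp1/VoiceCraft", filename="encodec_4cb2048_giga.th",
                             local_dir="./pretrained_models")
```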
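The TTS notebook hard-codes `cut_off_sec = 3.01` because, per the forced-alignment CSV, the word "common" ends at 3.01 s in the demo utterance; for other audio that value has to be read from the corresponding alignment file (columns `Begin,End,Label,Type,Speaker`, as in `demo/temp/mfa_alignments/84_121550_000074_000000.csv`). A small helper along these lines could look it up; `word_end_time` is a hypothetical name, not something the repo provides:

```python
import csv

def word_end_time(alignment_csv, word, occurrence=1):
    """Return the End time (sec) of the N-th occurrence of `word` in an MFA alignment CSV."""
    seen = 0
    with open(alignment_csv, newline="") as f:
        for row in csv.DictReader(f):
            if row["Type"] == "words" and row["Label"].lower() == word.lower():
                seen += 1
                if seen == occurrence:
                    return float(row["End"])
    raise ValueError(f"word {word!r} (occurrence {occurrence}) not found in {alignment_csv}")

# e.g. cut_off_sec = word_end_time("./demo/temp/mfa_alignments/84_121550_000074_000000.csv", "common")
```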
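The `sample_batch_size` comment describes the trick in words: the model decodes `sample_batch_size` candidates for the same prompt and keeps the shortest one, since overly long outputs usually mean stretched words or long silences. Purely as an illustration of that selection step, not the model's actual implementation (`generate_candidate` is a hypothetical stand-in for one sampling run):

```python
def pick_shortest(generate_candidate, sample_batch_size=4):
    # Illustration of "run sample_batch_size examples of the same audio and pick the shortest".
    candidates = [generate_candidate() for _ in range(sample_batch_size)]
    return min(candidates, key=lambda audio: audio.shape[-1])  # fewest audio samples wins
```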