From ddfef8333147c3c988f439bf3a24394d8e189113 Mon Sep 17 00:00:00 2001 From: Puyuan Peng <47729801+jasonppy@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:19:07 -0500 Subject: [PATCH 1/2] Update gradio_requirements.txt --- gradio_requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gradio_requirements.txt b/gradio_requirements.txt index 967b3d7..e9e7635 100644 --- a/gradio_requirements.txt +++ b/gradio_requirements.txt @@ -2,4 +2,5 @@ gradio==3.50.2 nltk>=3.8.1 openai-whisper>=20231117 aeneas>=1.7.3.0 -whisperx>=3.1.1 \ No newline at end of file +whisperx>=3.1.1 +huggingface_hub==0.22.2 From 7efcb3ee663e083f743bf80a86c9a3a98829261b Mon Sep 17 00:00:00 2001 From: jason-on-salt-a40 Date: Thu, 18 Apr 2024 12:38:55 -0700 Subject: [PATCH 2/2] fix editing notebook --- .gitignore | 1 + inference_speech_editing.ipynb | 58 ++++++++++++++++++---------------- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 17dbc9b..b88bfef 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ thumbs.db *.mp3 *.pth *.th +*.json *durip* *rtx* diff --git a/inference_speech_editing.ipynb b/inference_speech_editing.ipynb index 022e3ba..a0b5cd5 100644 --- a/inference_speech_editing.ipynb +++ b/inference_speech_editing.ipynb @@ -32,6 +32,7 @@ "import torchaudio\n", "import numpy as np\n", "import random\n", + "from argparse import Namespace\n", "\n", "from data.tokenizer import (\n", " AudioTokenizer,\n", @@ -84,6 +85,34 @@ " torch.backends.cudnn.deterministic = True\n", "seed_everything(seed)\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "# load model, tokenizer, and other necessary files\n", + "voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n", + "\n", + "# the new way of loading the model, with huggingface, recommended\n", + "from models import voicecraft\n", + "model = voicecraft.VoiceCraft.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n", + "phn2num = model.args.phn2num\n", + "config = vars(model.args)\n", + "model.to(device)\n", + "\n", + "# # the old way of loading the model\n", + "# from models import voicecraft\n", + "# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n", + "# ckpt = torch.load(filepath, map_location=\"cpu\")\n", + "# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", + "# model.load_state_dict(ckpt[\"model\"])\n", + "# config = vars(model.args)\n", + "# phn2num = ckpt[\"phn2num\"]\n", + "# model.to(device)\n", + "# model.eval()\n", + "\n", + "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", + "if not os.path.exists(encodec_fn):\n", + " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n", + " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n", + "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n", + "\n", + "text_tokenizer = TextTokenizer(backend=\"espeak\")\n", "\n", "# point to the original file or record the file\n", "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n", @@ -199,40 +228,13 @@ "mask_interval = [[round(morphed_span[0]*codec_sr), round(morphed_span[1]*codec_sr)]]\n", "mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n", "\n", - "# load model, tokenizer, and other necessary files\n", - "voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n", "\n", - "# the new way of loading the model, with huggingface, recommended\n", - "from models import voicecraft\n", - "model = voicecraft.VoiceCraft.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n", - "phn2num = model.args.phn2num\n", - "config = vars(model.args)\n", - "model.to(device)\n", - "\n", - "# # the old way of loading the model\n", - "# from models import voicecraft\n", - "# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n", - "# ckpt = torch.load(filepath, map_location=\"cpu\")\n", - "# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", - "# model.load_state_dict(ckpt[\"model\"])\n", - "# config = vars(model.args)\n", - "# phn2num = ckpt[\"phn2num\"]\n", - "# model.to(device)\n", - "# model.eval()\n", - "\n", - "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", - "if not os.path.exists(encodec_fn):\n", - " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n", - " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n", - "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n", - "\n", - "text_tokenizer = TextTokenizer(backend=\"espeak\")\n", "\n", "# run the model to get the output\n", "from inference_speech_editing_scale import inference_one_sample\n", "\n", "decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens}\n", - "orig_audio, new_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)\n", + "orig_audio, new_audio = inference_one_sample(model, Namespace(**config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)\n", " \n", "# save segments for comparison\n", "orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()\n",