Merge branch 'master' of https://github.com/chenxwh/VoiceCraft
This commit is contained in:
commit
ef3dd8285b
|
@ -17,6 +17,7 @@ thumbs.db
|
|||
*.mp3
|
||||
*.pth
|
||||
*.th
|
||||
*.json
|
||||
|
||||
*durip*
|
||||
*rtx*
|
||||
|
|
|
@ -2,4 +2,5 @@ gradio==3.50.2
|
|||
nltk>=3.8.1
|
||||
openai-whisper>=20231117
|
||||
aeneas>=1.7.3.0
|
||||
whisperx>=3.1.1
|
||||
whisperx>=3.1.1
|
||||
huggingface_hub==0.22.2
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
"import torchaudio\n",
|
||||
"import numpy as np\n",
|
||||
"import random\n",
|
||||
"from argparse import Namespace\n",
|
||||
"\n",
|
||||
"from data.tokenizer import (\n",
|
||||
" AudioTokenizer,\n",
|
||||
|
@ -84,6 +85,34 @@
|
|||
" torch.backends.cudnn.deterministic = True\n",
|
||||
"seed_everything(seed)\n",
|
||||
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"# load model, tokenizer, and other necessary files\n",
|
||||
"voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
|
||||
"\n",
|
||||
"# the new way of loading the model, with huggingface, recommended\n",
|
||||
"from models import voicecraft\n",
|
||||
"model = voicecraft.VoiceCraft.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
|
||||
"phn2num = model.args.phn2num\n",
|
||||
"config = vars(model.args)\n",
|
||||
"model.to(device)\n",
|
||||
"\n",
|
||||
"# # the old way of loading the model\n",
|
||||
"# from models import voicecraft\n",
|
||||
"# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n",
|
||||
"# ckpt = torch.load(filepath, map_location=\"cpu\")\n",
|
||||
"# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
|
||||
"# model.load_state_dict(ckpt[\"model\"])\n",
|
||||
"# config = vars(model.args)\n",
|
||||
"# phn2num = ckpt[\"phn2num\"]\n",
|
||||
"# model.to(device)\n",
|
||||
"# model.eval()\n",
|
||||
"\n",
|
||||
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
|
||||
"if not os.path.exists(encodec_fn):\n",
|
||||
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
|
||||
" os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
|
||||
"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
|
||||
"\n",
|
||||
"text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
|
||||
"\n",
|
||||
"# point to the original file or record the file\n",
|
||||
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
|
||||
|
@ -199,40 +228,13 @@
|
|||
"mask_interval = [[round(morphed_span[0]*codec_sr), round(morphed_span[1]*codec_sr)]]\n",
|
||||
"mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n",
|
||||
"\n",
|
||||
"# load model, tokenizer, and other necessary files\n",
|
||||
"voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
|
||||
"\n",
|
||||
"# the new way of loading the model, with huggingface, recommended\n",
|
||||
"from models import voicecraft\n",
|
||||
"model = voicecraft.VoiceCraft.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
|
||||
"phn2num = model.args.phn2num\n",
|
||||
"config = vars(model.args)\n",
|
||||
"model.to(device)\n",
|
||||
"\n",
|
||||
"# # the old way of loading the model\n",
|
||||
"# from models import voicecraft\n",
|
||||
"# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n",
|
||||
"# ckpt = torch.load(filepath, map_location=\"cpu\")\n",
|
||||
"# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
|
||||
"# model.load_state_dict(ckpt[\"model\"])\n",
|
||||
"# config = vars(model.args)\n",
|
||||
"# phn2num = ckpt[\"phn2num\"]\n",
|
||||
"# model.to(device)\n",
|
||||
"# model.eval()\n",
|
||||
"\n",
|
||||
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
|
||||
"if not os.path.exists(encodec_fn):\n",
|
||||
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
|
||||
" os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
|
||||
"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
|
||||
"\n",
|
||||
"text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
|
||||
"\n",
|
||||
"# run the model to get the output\n",
|
||||
"from inference_speech_editing_scale import inference_one_sample\n",
|
||||
"\n",
|
||||
"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens}\n",
|
||||
"orig_audio, new_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)\n",
|
||||
"orig_audio, new_audio = inference_one_sample(model, Namespace(**config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)\n",
|
||||
" \n",
|
||||
"# save segments for comparison\n",
|
||||
"orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()\n",
|
||||
|
|
Loading…
Reference in New Issue