hf model download
This commit is contained in:
parent
3a8d5f4aab
commit
57079c44b6
|
@ -20,7 +20,8 @@ RUN conda update -y -n base -c conda-forge conda && \
|
|||
conda run -n voicecraft pip install tensorboard==2.16.2 && \
|
||||
conda run -n voicecraft pip install phonemizer==3.2.1 && \
|
||||
conda run -n voicecraft pip install datasets==2.16.0 && \
|
||||
conda run -n voicecraft pip install torchmetrics==0.11.1
|
||||
conda run -n voicecraft pip install torchmetrics==0.11.1 && \
|
||||
conda run -n voicecraft pip install huggingface_hub==0.22.2
|
||||
|
||||
|
||||
# Install the Jupyter kernel
|
||||
|
|
|
@ -97,6 +97,7 @@ pip install tensorboard==2.16.2
|
|||
pip install phonemizer==3.2.1
|
||||
pip install datasets==2.16.0
|
||||
pip install torchmetrics==0.11.1
|
||||
pip install huggingface_hub==0.22.2
|
||||
# install MFA for getting forced-alignment, this could take a few minutes
|
||||
conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068
|
||||
# install MFA english dictionary and model
|
||||
|
|
|
@ -308,7 +308,7 @@ dependencies:
|
|||
- h11==0.14.0
|
||||
- httpcore==1.0.4
|
||||
- httpx==0.27.0
|
||||
- huggingface-hub==0.21.4
|
||||
- huggingface-hub==0.22.4
|
||||
- hydra-colorlog==1.2.0
|
||||
- hydra-core==1.3.2
|
||||
- ipython==8.12.3
|
||||
|
|
|
@ -92,27 +92,22 @@ def load_models(whisper_backend_name, whisper_model_name, alignment_model_name,
|
|||
transcribe_model = WhisperxModel(whisper_model_name, align_model)
|
||||
|
||||
voicecraft_name = f"{voicecraft_model_name}.pth"
|
||||
ckpt_fn = f"{MODELS_PATH}/{voicecraft_name}"
|
||||
model = voicecraft.VoiceCraftHF.from_pretrained(f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}")
|
||||
phn2num = model.args.phn2num
|
||||
config = model.args
|
||||
model.to(device)
|
||||
|
||||
encodec_fn = f"{MODELS_PATH}/encodec_4cb2048_giga.th"
|
||||
if not os.path.exists(ckpt_fn):
|
||||
os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true")
|
||||
os.system(f"mv {voicecraft_name}\?download\=true {MODELS_PATH}/{voicecraft_name}")
|
||||
if not os.path.exists(encodec_fn):
|
||||
os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
|
||||
os.system(f"mv encodec_4cb2048_giga.th {MODELS_PATH}/encodec_4cb2048_giga.th")
|
||||
|
||||
ckpt = torch.load(ckpt_fn, map_location="cpu")
|
||||
model = voicecraft.VoiceCraft(ckpt["config"])
|
||||
model.load_state_dict(ckpt["model"])
|
||||
model.to(device)
|
||||
model.eval()
|
||||
voicecraft_model = {
|
||||
"ckpt": ckpt,
|
||||
"config": config,
|
||||
"phn2num": phn2num,
|
||||
"model": model,
|
||||
"text_tokenizer": TextTokenizer(backend="espeak"),
|
||||
"audio_tokenizer": AudioTokenizer(signature=encodec_fn)
|
||||
}
|
||||
|
||||
return gr.Accordion()
|
||||
|
||||
|
||||
|
@ -254,8 +249,8 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
|
|||
|
||||
prompt_end_frame = int(min(audio_dur, prompt_end_time) * info.sample_rate)
|
||||
_, gen_audio = inference_one_sample(voicecraft_model["model"],
|
||||
voicecraft_model["ckpt"]["config"],
|
||||
voicecraft_model["ckpt"]["phn2num"],
|
||||
voicecraft_model["config"],
|
||||
voicecraft_model["phn2num"],
|
||||
voicecraft_model["text_tokenizer"], voicecraft_model["audio_tokenizer"],
|
||||
audio_path, target_transcript, device, decode_config,
|
||||
prompt_end_frame)
|
||||
|
@ -283,8 +278,8 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
|
|||
mask_interval = torch.LongTensor(mask_interval)
|
||||
|
||||
_, gen_audio = inference_one_sample(voicecraft_model["model"],
|
||||
voicecraft_model["ckpt"]["config"],
|
||||
voicecraft_model["ckpt"]["phn2num"],
|
||||
voicecraft_model["config"],
|
||||
voicecraft_model["phn2num"],
|
||||
voicecraft_model["text_tokenizer"], voicecraft_model["audio_tokenizer"],
|
||||
audio_path, target_transcript, mask_interval, device, decode_config)
|
||||
gen_audio = gen_audio[0].cpu()
|
||||
|
|
|
@ -200,25 +200,33 @@
|
|||
"mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n",
|
||||
"\n",
|
||||
"# load model, tokenizer, and other necessary files\n",
|
||||
"voicecraft_name=\"giga330M.pth\"\n",
|
||||
"ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
|
||||
"voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
|
||||
"\n",
|
||||
"# the new way of loading the model, with huggingface, recommended\n",
|
||||
"from models.voicecraft import VoiceCraftHF\n",
|
||||
"model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
|
||||
"phn2num = model.args.phn2num\n",
|
||||
"config = vars(model.args)\n",
|
||||
"model.to(device)\n",
|
||||
"\n",
|
||||
"# # the old way of loading the model\n",
|
||||
"# from models import voicecraft\n",
|
||||
"# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n",
|
||||
"# ckpt = torch.load(filepath, map_location=\"cpu\")\n",
|
||||
"# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
|
||||
"# model.load_state_dict(ckpt[\"model\"])\n",
|
||||
"# config = vars(model.args)\n",
|
||||
"# phn2num = ckpt[\"phn2num\"]\n",
|
||||
"# model.to(device)\n",
|
||||
"# model.eval()\n",
|
||||
"\n",
|
||||
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
|
||||
"if not os.path.exists(ckpt_fn):\n",
|
||||
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",
|
||||
" os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",
|
||||
"if not os.path.exists(encodec_fn):\n",
|
||||
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
|
||||
" os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
|
||||
"ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
|
||||
"model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
|
||||
"model.load_state_dict(ckpt[\"model\"])\n",
|
||||
"model.to(device)\n",
|
||||
"model.eval()\n",
|
||||
"\n",
|
||||
"phn2num = ckpt['phn2num']\n",
|
||||
"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
|
||||
"\n",
|
||||
"text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
|
||||
"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
|
||||
"\n",
|
||||
"# run the model to get the output\n",
|
||||
"from inference_speech_editing_scale import inference_one_sample\n",
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -26,7 +26,7 @@
|
|||
"import os\n",
|
||||
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
|
||||
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
|
||||
"os.environ[\"USER\"] = \"YOUR_USERNAME\" # TODO change this to your username\n",
|
||||
"os.environ[\"USER\"] = \"me\" # TODO change this to your username\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torchaudio\n",
|
||||
|
@ -37,52 +37,58 @@
|
|||
"from data.tokenizer import (\n",
|
||||
" AudioTokenizer,\n",
|
||||
" TextTokenizer,\n",
|
||||
")\n"
|
||||
")\n",
|
||||
"from huggingface_hub import hf_hub_download"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# install MFA models and dictionaries if you haven't done so already, already done in the dockerfile or environment setup\n",
|
||||
"!source ~/.bashrc && \\\n",
|
||||
" conda activate voicecraft && \\\n",
|
||||
" mfa model download dictionary english_us_arpa && \\\n",
|
||||
" mfa model download acoustic english_us_arpa"
|
||||
"# # install MFA models and dictionaries if you haven't done so already, already done in the dockerfile or environment setup\n",
|
||||
"# !source ~/.bashrc && \\\n",
|
||||
"# conda activate voicecraft && \\\n",
|
||||
"# mfa model download dictionary english_us_arpa && \\\n",
|
||||
"# mfa model download acoustic english_us_arpa"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Dora directory: /tmp/audiocraft_me\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# load model, encodec, and phn2num\n",
|
||||
"# # load model, tokenizer, and other necessary files\n",
|
||||
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
|
||||
"\n",
|
||||
"# the old way of loading the model\n",
|
||||
"from models import voicecraft\n",
|
||||
"ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
|
||||
"if not os.path.exists(ckpt_fn):\n",
|
||||
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",
|
||||
" os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",
|
||||
"ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
|
||||
"model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
|
||||
"model.load_state_dict(ckpt[\"model\"])\n",
|
||||
"phn2num = ckpt['phn2num']\n",
|
||||
"config = vars(ckpt['config'])\n",
|
||||
"# the new way of loading the model, with huggingface, recommended\n",
|
||||
"from models.voicecraft import VoiceCraftHF\n",
|
||||
"model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
|
||||
"phn2num = model.args.phn2num\n",
|
||||
"config = vars(model.args)\n",
|
||||
"model.to(device)\n",
|
||||
"model.eval()\n",
|
||||
"\n",
|
||||
"# # the new way of loading the model, with huggingface, this doesn't work yet\n",
|
||||
"# from models.voicecraft import VoiceCraftHF\n",
|
||||
"# model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
|
||||
"# phn2num = model.args.phn2num # or model.args['phn2num']?\n",
|
||||
"# config = model.config\n",
|
||||
"\n",
|
||||
"# # the old way of loading the model\n",
|
||||
"# from models import voicecraft\n",
|
||||
"# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n",
|
||||
"# ckpt = torch.load(filepath, map_location=\"cpu\")\n",
|
||||
"# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
|
||||
"# model.load_state_dict(ckpt[\"model\"])\n",
|
||||
"# config = vars(model.args)\n",
|
||||
"# phn2num = ckpt[\"phn2num\"]\n",
|
||||
"# model.to(device)\n",
|
||||
"# model.eval()\n",
|
||||
"\n",
|
||||
|
@ -98,7 +104,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -159,7 +165,7 @@
|
|||
"\n",
|
||||
"# NOTE adjust the below three arguments if the generation is not as good\n",
|
||||
"stop_repetition = 3 # NOTE if the model generates long silence, reduce the stop_repetition to 3, 2 or even 1\n",
|
||||
"sample_batch_size = 4 # NOTE: if there are long silences or unnaturally stretched words, increase sample_batch_size to 5 or higher. The model will then run sample_batch_size examples of the same audio and pick the shortest one. So if the speech rate of the generated audio is too fast, change it to a smaller number.\n",
|
||||
"sample_batch_size = 5 # NOTE: if there are long silences or unnaturally stretched words, increase sample_batch_size to 5 or higher. The model will then run sample_batch_size examples of the same audio and pick the shortest one. So if the speech rate of the generated audio is too fast, change it to a smaller number.\n",
|
||||
"seed = 1 # change seed if you are still unhappy with the result\n",
|
||||
"\n",
|
||||
"def seed_everything(seed):\n",
|
||||
|
|
|
@ -1416,9 +1416,7 @@ class VoiceCraft(nn.Module):
|
|||
return res, flatten_gen[0].unsqueeze(0)
|
||||
|
||||
|
||||
class VoiceCraftHF(VoiceCraft, PyTorchModelHubMixin):
|
||||
repo_url="https://github.com/jasonppy/VoiceCraft",
|
||||
tags=["Text-to-Speech", "VoiceCraft"]
|
||||
class VoiceCraftHF(VoiceCraft, PyTorchModelHubMixin, repo_url="https://github.com/jasonppy/VoiceCraft", tags=["Text-to-Speech", "VoiceCraft"]):
|
||||
def __init__(self, config: dict):
|
||||
args = Namespace(**config)
|
||||
super().__init__(args)
|
Loading…
Reference in New Issue