From 57079c44b6cc02cc392b22e457ced6c2e510d8d4 Mon Sep 17 00:00:00 2001 From: jason-on-salt-a40 Date: Sat, 13 Apr 2024 15:11:15 -0700 Subject: [PATCH] hf model download --- Dockerfile | 3 +- README.md | 1 + environment.yml | 2 +- gradio_app.py | 27 ++++++-------- inference_speech_editing.ipynb | 34 +++++++++++------- inference_tts.ipynb | 66 ++++++++++++++++++---------------- models/voicecraft.py | 4 +-- 7 files changed, 73 insertions(+), 64 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3fbe052..f0aa8d3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,8 @@ RUN conda update -y -n base -c conda-forge conda && \ conda run -n voicecraft pip install tensorboard==2.16.2 && \ conda run -n voicecraft pip install phonemizer==3.2.1 && \ conda run -n voicecraft pip install datasets==2.16.0 && \ - conda run -n voicecraft pip install torchmetrics==0.11.1 + conda run -n voicecraft pip install torchmetrics==0.11.1 && \ + conda run -n voicecraft pip install huggingface_hub==0.22.2 # Install the Jupyter kernel diff --git a/README.md b/README.md index cd4518e..4ea9ae0 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,7 @@ pip install tensorboard==2.16.2 pip install phonemizer==3.2.1 pip install datasets==2.16.0 pip install torchmetrics==0.11.1 +pip install huggingface_hub==0.22.2 # install MFA for getting forced-alignment, this could take a few minutes conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 # install MFA english dictionary and model diff --git a/environment.yml b/environment.yml index ca0906d..ad1a969 100644 --- a/environment.yml +++ b/environment.yml @@ -308,7 +308,7 @@ dependencies: - h11==0.14.0 - httpcore==1.0.4 - httpx==0.27.0 - - huggingface-hub==0.21.4 + - huggingface-hub==0.22.2 - hydra-colorlog==1.2.0 - hydra-core==1.3.2 - ipython==8.12.3 diff --git a/gradio_app.py b/gradio_app.py index 907ffed..41f64a5 100644 --- a/gradio_app.py +++ b/gradio_app.py @@ -92,27 +92,22 @@ def load_models(whisper_backend_name, 
whisper_model_name, alignment_model_name, transcribe_model = WhisperxModel(whisper_model_name, align_model) voicecraft_name = f"{voicecraft_model_name}.pth" - ckpt_fn = f"{MODELS_PATH}/{voicecraft_name}" + model = voicecraft.VoiceCraftHF.from_pretrained(f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}") + phn2num = model.args.phn2num + config = model.args + model.to(device) + encodec_fn = f"{MODELS_PATH}/encodec_4cb2048_giga.th" - if not os.path.exists(ckpt_fn): - os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true") - os.system(f"mv {voicecraft_name}\?download\=true {MODELS_PATH}/{voicecraft_name}") if not os.path.exists(encodec_fn): os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th") - os.system(f"mv encodec_4cb2048_giga.th {MODELS_PATH}/encodec_4cb2048_giga.th") - ckpt = torch.load(ckpt_fn, map_location="cpu") - model = voicecraft.VoiceCraft(ckpt["config"]) - model.load_state_dict(ckpt["model"]) - model.to(device) - model.eval() voicecraft_model = { - "ckpt": ckpt, + "config": config, + "phn2num": phn2num, "model": model, "text_tokenizer": TextTokenizer(backend="espeak"), "audio_tokenizer": AudioTokenizer(signature=encodec_fn) } - return gr.Accordion() @@ -254,8 +249,8 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, prompt_end_frame = int(min(audio_dur, prompt_end_time) * info.sample_rate) _, gen_audio = inference_one_sample(voicecraft_model["model"], - voicecraft_model["ckpt"]["config"], - voicecraft_model["ckpt"]["phn2num"], + voicecraft_model["config"], + voicecraft_model["phn2num"], voicecraft_model["text_tokenizer"], voicecraft_model["audio_tokenizer"], audio_path, target_transcript, device, decode_config, prompt_end_frame) @@ -283,8 +278,8 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p, mask_interval = torch.LongTensor(mask_interval) _, gen_audio = 
inference_one_sample(voicecraft_model["model"], - voicecraft_model["ckpt"]["config"], - voicecraft_model["ckpt"]["phn2num"], + voicecraft_model["config"], + voicecraft_model["phn2num"], voicecraft_model["text_tokenizer"], voicecraft_model["audio_tokenizer"], audio_path, target_transcript, mask_interval, device, decode_config) gen_audio = gen_audio[0].cpu() diff --git a/inference_speech_editing.ipynb b/inference_speech_editing.ipynb index 59b7469..4502966 100644 --- a/inference_speech_editing.ipynb +++ b/inference_speech_editing.ipynb @@ -200,25 +200,33 @@ "mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n", "\n", "# load model, tokenizer, and other necessary files\n", - "voicecraft_name=\"giga330M.pth\"\n", - "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n", + "voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n", + "\n", + "# the new way of loading the model, with huggingface, recommended\n", + "from models.voicecraft import VoiceCraftHF\n", + "model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n", + "phn2num = model.args.phn2num\n", + "config = vars(model.args)\n", + "model.to(device)\n", + "\n", + "# # the old way of loading the model\n", + "# from models import voicecraft\n", + "# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n", + "# ckpt = torch.load(filepath, map_location=\"cpu\")\n", + "# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", + "# model.load_state_dict(ckpt[\"model\"])\n", + "# config = vars(model.args)\n", + "# phn2num = ckpt[\"phn2num\"]\n", + "# model.to(device)\n", + "# model.eval()\n", + "\n", "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", - "if not os.path.exists(ckpt_fn):\n", - " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n", - " os.system(f\"mv {voicecraft_name}\\?download\\=true 
./pretrained_models/{voicecraft_name}\")\n", "if not os.path.exists(encodec_fn):\n", " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n", " os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n", - "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n", - "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", - "model.load_state_dict(ckpt[\"model\"])\n", - "model.to(device)\n", - "model.eval()\n", - "\n", - "phn2num = ckpt['phn2num']\n", + "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n", "\n", "text_tokenizer = TextTokenizer(backend=\"espeak\")\n", - "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n", "\n", "# run the model to get the output\n", "from inference_speech_editing_scale import inference_one_sample\n", diff --git a/inference_tts.ipynb b/inference_tts.ipynb index e54368c..f9a1862 100644 --- a/inference_tts.ipynb +++ b/inference_tts.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ "import os\n", "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n", "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", - "os.environ[\"USER\"] = \"YOUR_USERNAME\" # TODO change this to your username\n", + "os.environ[\"USER\"] = \"me\" # TODO change this to your username\n", "\n", "import torch\n", "import torchaudio\n", @@ -37,52 +37,58 @@ "from data.tokenizer import (\n", " AudioTokenizer,\n", " TextTokenizer,\n", - ")\n" + ")\n", + "from huggingface_hub import hf_hub_download" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# install MFA models and dictionaries if you haven't done so already, already done in the dockerfile or envrionment setup\n", - "!source ~/.bashrc && \\\n", - " conda activate 
voicecraft && \\\n", - " mfa model download dictionary english_us_arpa && \\\n", - " mfa model download acoustic english_us_arpa" + "# # install MFA models and dictionaries if you haven't done so already, already done in the dockerfile or environment setup\n", + "# !source ~/.bashrc && \\\n", + "# conda activate voicecraft && \\\n", + "# mfa model download dictionary english_us_arpa && \\\n", + "# mfa model download acoustic english_us_arpa" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Dora directory: /tmp/audiocraft_me\n" + ] + } + ], "source": [ "# load model, encodec, and phn2num\n", "# # load model, tokenizer, and other necessary files\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n", "\n", - "# the old way of loading the model\n", - "from models import voicecraft\n", - "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n", - "if not os.path.exists(ckpt_fn):\n", - " os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n", - " os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n", - "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n", - "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", - "model.load_state_dict(ckpt[\"model\"])\n", - "phn2num = ckpt['phn2num']\n", - "config = vars(ckpt['config'])\n", + "# the new way of loading the model, with huggingface, recommended\n", + "from models.voicecraft import VoiceCraftHF\n", + "model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n", + "phn2num = model.args.phn2num\n", + "config = vars(model.args)\n", "model.to(device)\n", - "model.eval()\n", "\n", - "# # the new way of loading the model, with huggingface, this 
doesn't work yet\n", - "# from models.voicecraft import VoiceCraftHF\n", - "# model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n", - "# phn2num = model.args.phn2num # or model.args['phn2num']?\n", - "# config = model.config\n", + "\n", + "# # the old way of loading the model\n", + "# from models import voicecraft\n", + "# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n", + "# ckpt = torch.load(filepath, map_location=\"cpu\")\n", + "# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n", + "# model.load_state_dict(ckpt[\"model\"])\n", + "# config = vars(model.args)\n", + "# phn2num = ckpt[\"phn2num\"]\n", "# model.to(device)\n", "# model.eval()\n", "\n", @@ -98,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -159,7 +165,7 @@ "\n", "# NOTE adjust the below three arguments if the generation is not as good\n", "stop_repetition = 3 # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1\n", - "sample_batch_size = 4 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.\n", + "sample_batch_size = 5 # NOTE: if there are long silences or unnaturally stretched words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. 
So if the speech rate of the generated is too fast change it to a smaller number.\n", "seed = 1 # change seed if you are still unhappy with the result\n", "\n", "def seed_everything(seed):\n", diff --git a/models/voicecraft.py b/models/voicecraft.py index 19b0a25..9bb3393 100644 --- a/models/voicecraft.py +++ b/models/voicecraft.py @@ -1416,9 +1416,7 @@ class VoiceCraft(nn.Module): return res, flatten_gen[0].unsqueeze(0) -class VoiceCraftHF(VoiceCraft, PyTorchModelHubMixin): - repo_url="https://github.com/jasonppy/VoiceCraft", - tags=["Text-to-Speech", "VoiceCraft"] +class VoiceCraftHF(VoiceCraft, PyTorchModelHubMixin, repo_url="https://github.com/jasonppy/VoiceCraft", tags=["Text-to-Speech", "VoiceCraft"]): def __init__(self, config: dict): args = Namespace(**config) super().__init__(args) \ No newline at end of file