From 57079c44b6cc02cc392b22e457ced6c2e510d8d4 Mon Sep 17 00:00:00 2001
From: jason-on-salt-a40 <pyp@utexas.edu>
Date: Sat, 13 Apr 2024 15:11:15 -0700
Subject: [PATCH] hf model download

---
 Dockerfile                     |  3 +-
 README.md                      |  1 +
 environment.yml                |  2 +-
 gradio_app.py                  | 27 ++++++--------
 inference_speech_editing.ipynb | 34 +++++++++++-------
 inference_tts.ipynb            | 66 ++++++++++++++++++----------------
 models/voicecraft.py           |  4 +--
 7 files changed, 73 insertions(+), 64 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 3fbe052..f0aa8d3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,7 +20,8 @@ RUN conda update -y -n base -c conda-forge conda && \
     conda run -n voicecraft pip install tensorboard==2.16.2 && \
     conda run -n voicecraft pip install phonemizer==3.2.1 && \
     conda run -n voicecraft pip install datasets==2.16.0 && \
-    conda run -n voicecraft pip install torchmetrics==0.11.1
+    conda run -n voicecraft pip install torchmetrics==0.11.1 && \
+    conda run -n voicecraft pip install huggingface_hub==0.22.2
     
 
 # Install the Jupyter kernel
diff --git a/README.md b/README.md
index cd4518e..4ea9ae0 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ pip install tensorboard==2.16.2
 pip install phonemizer==3.2.1
 pip install datasets==2.16.0
 pip install torchmetrics==0.11.1
+pip install huggingface_hub==0.22.2
 # install MFA for getting forced-alignment, this could take a few minutes
 conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068
 # install MFA english dictionary and model
diff --git a/environment.yml b/environment.yml
index ca0906d..ad1a969 100644
--- a/environment.yml
+++ b/environment.yml
@@ -308,7 +308,7 @@ dependencies:
       - h11==0.14.0
       - httpcore==1.0.4
       - httpx==0.27.0
-      - huggingface-hub==0.21.4
+      - huggingface-hub==0.22.2
       - hydra-colorlog==1.2.0
       - hydra-core==1.3.2
       - ipython==8.12.3
diff --git a/gradio_app.py b/gradio_app.py
index 907ffed..41f64a5 100644
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -92,27 +92,22 @@ def load_models(whisper_backend_name, whisper_model_name, alignment_model_name,
             transcribe_model = WhisperxModel(whisper_model_name, align_model)
 
     voicecraft_name = f"{voicecraft_model_name}.pth"
-    ckpt_fn = f"{MODELS_PATH}/{voicecraft_name}"
+    model = voicecraft.VoiceCraftHF.from_pretrained(f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}")
+    phn2num = model.args.phn2num
+    config = model.args
+    model.to(device)
+
     encodec_fn = f"{MODELS_PATH}/encodec_4cb2048_giga.th"
-    if not os.path.exists(ckpt_fn):
-        os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true")
-        os.system(f"mv {voicecraft_name}\?download\=true {MODELS_PATH}/{voicecraft_name}")
     if not os.path.exists(encodec_fn):
         os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
-        os.system(f"mv encodec_4cb2048_giga.th {MODELS_PATH}/encodec_4cb2048_giga.th")
 
-    ckpt = torch.load(ckpt_fn, map_location="cpu")
-    model = voicecraft.VoiceCraft(ckpt["config"])
-    model.load_state_dict(ckpt["model"])
-    model.to(device)
-    model.eval()
     voicecraft_model = {
-        "ckpt": ckpt,
+        "config": config,
+        "phn2num": phn2num,
         "model": model,
         "text_tokenizer": TextTokenizer(backend="espeak"),
         "audio_tokenizer": AudioTokenizer(signature=encodec_fn)
     }
-
     return gr.Accordion()
 
 
@@ -254,8 +249,8 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
 
             prompt_end_frame = int(min(audio_dur, prompt_end_time) * info.sample_rate)
             _, gen_audio = inference_one_sample(voicecraft_model["model"],
-                                                voicecraft_model["ckpt"]["config"],
-                                                voicecraft_model["ckpt"]["phn2num"],
+                                                voicecraft_model["config"],
+                                                voicecraft_model["phn2num"],
                                                 voicecraft_model["text_tokenizer"], voicecraft_model["audio_tokenizer"],
                                                 audio_path, target_transcript, device, decode_config,
                                                 prompt_end_frame)
@@ -283,8 +278,8 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
             mask_interval = torch.LongTensor(mask_interval)
 
             _, gen_audio = inference_one_sample(voicecraft_model["model"],
-                                                voicecraft_model["ckpt"]["config"],
-                                                voicecraft_model["ckpt"]["phn2num"],
+                                                voicecraft_model["config"],
+                                                voicecraft_model["phn2num"],
                                                 voicecraft_model["text_tokenizer"], voicecraft_model["audio_tokenizer"],
                                                 audio_path, target_transcript, mask_interval, device, decode_config)
         gen_audio = gen_audio[0].cpu()
diff --git a/inference_speech_editing.ipynb b/inference_speech_editing.ipynb
index 59b7469..4502966 100644
--- a/inference_speech_editing.ipynb
+++ b/inference_speech_editing.ipynb
@@ -200,25 +200,33 @@
     "mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n",
     "\n",
     "# load model, tokenizer, and other necessary files\n",
-    "voicecraft_name=\"giga330M.pth\"\n",
-    "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
+    "voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
+    "\n",
+    "# the new way of loading the model, with huggingface, recommended\n",
+    "from models.voicecraft import VoiceCraftHF\n",
+    "model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
+    "phn2num = model.args.phn2num\n",
+    "config = vars(model.args)\n",
+    "model.to(device)\n",
+    "\n",
+    "# # the old way of loading the model\n",
+    "# from models import voicecraft\n",
+    "# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n",
+    "# ckpt = torch.load(filepath, map_location=\"cpu\")\n",
+    "# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
+    "# model.load_state_dict(ckpt[\"model\"])\n",
+    "# config = vars(model.args)\n",
+    "# phn2num = ckpt[\"phn2num\"]\n",
+    "# model.to(device)\n",
+    "# model.eval()\n",
+    "\n",
     "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
-    "if not os.path.exists(ckpt_fn):\n",
-    "    os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",
-    "    os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",
     "if not os.path.exists(encodec_fn):\n",
     "    os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
     "    os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
-    "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
-    "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
-    "model.load_state_dict(ckpt[\"model\"])\n",
-    "model.to(device)\n",
-    "model.eval()\n",
-    "\n",
-    "phn2num = ckpt['phn2num']\n",
+    "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
     "\n",
     "text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
-    "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
     "\n",
     "# run the model to get the output\n",
     "from inference_speech_editing_scale import inference_one_sample\n",
diff --git a/inference_tts.ipynb b/inference_tts.ipynb
index e54368c..f9a1862 100644
--- a/inference_tts.ipynb
+++ b/inference_tts.ipynb
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -26,7 +26,7 @@
     "import os\n",
     "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   \n",
     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
-    "os.environ[\"USER\"] = \"YOUR_USERNAME\" # TODO change this to your username\n",
+    "os.environ[\"USER\"] = \"me\" # TODO change this to your username\n",
     "\n",
     "import torch\n",
     "import torchaudio\n",
@@ -37,52 +37,58 @@
     "from data.tokenizer import (\n",
     "    AudioTokenizer,\n",
     "    TextTokenizer,\n",
-    ")\n"
+    ")\n",
+    "from huggingface_hub import hf_hub_download"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# install MFA models and dictionaries if you haven't done so already, already done in the dockerfile or envrionment setup\n",
-    "!source ~/.bashrc && \\\n",
-    "    conda activate voicecraft && \\\n",
-    "    mfa model download dictionary english_us_arpa && \\\n",
-    "    mfa model download acoustic english_us_arpa"
+    "# # install MFA models and dictionaries if you haven't done so already; this is already done in the Dockerfile or environment setup\n",
+    "# !source ~/.bashrc && \\\n",
+    "#     conda activate voicecraft && \\\n",
+    "#     mfa model download dictionary english_us_arpa && \\\n",
+    "#     mfa model download acoustic english_us_arpa"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Dora directory: /tmp/audiocraft_me\n"
+     ]
+    }
+   ],
    "source": [
     "# load model, encodec, and phn2num\n",
     "# # load model, tokenizer, and other necessary files\n",
     "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
     "voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
     "\n",
-    "# the old way of loading the model\n",
-    "from models import voicecraft\n",
-    "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
-    "if not os.path.exists(ckpt_fn):\n",
-    "    os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",
-    "    os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",
-    "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
-    "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
-    "model.load_state_dict(ckpt[\"model\"])\n",
-    "phn2num = ckpt['phn2num']\n",
-    "config = vars(ckpt['config'])\n",
+    "# the new way of loading the model, with huggingface, recommended\n",
+    "from models.voicecraft import VoiceCraftHF\n",
+    "model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
+    "phn2num = model.args.phn2num\n",
+    "config = vars(model.args)\n",
     "model.to(device)\n",
-    "model.eval()\n",
     "\n",
-    "# # the new way of loading the model, with huggingface, this doesn't work yet\n",
-    "# from models.voicecraft import VoiceCraftHF\n",
-    "# model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
-    "# phn2num = model.args.phn2num # or model.args['phn2num']?\n",
-    "# config = model.config\n",
+    "\n",
+    "# # the old way of loading the model\n",
+    "# from models import voicecraft\n",
+    "# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n",
+    "# ckpt = torch.load(filepath, map_location=\"cpu\")\n",
+    "# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
+    "# model.load_state_dict(ckpt[\"model\"])\n",
+    "# config = vars(model.args)\n",
+    "# phn2num = ckpt[\"phn2num\"]\n",
     "# model.to(device)\n",
     "# model.eval()\n",
     "\n",
@@ -98,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -159,7 +165,7 @@
     "\n",
     "# NOTE adjust the below three arguments if the generation is not as good\n",
     "stop_repetition = 3 # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1\n",
-    "sample_batch_size = 4 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.\n",
+    "sample_batch_size = 5 # NOTE: if there are long silences or unnaturally stretched words, increase sample_batch_size to 5 or higher. What this does is make the model run sample_batch_size examples of the same audio and pick the one that's the shortest. So if the speech rate of the generated audio is too fast, change it to a smaller number.\n",
     "seed = 1 # change seed if you are still unhappy with the result\n",
     "\n",
     "def seed_everything(seed):\n",
diff --git a/models/voicecraft.py b/models/voicecraft.py
index 19b0a25..9bb3393 100644
--- a/models/voicecraft.py
+++ b/models/voicecraft.py
@@ -1416,9 +1416,7 @@ class VoiceCraft(nn.Module):
         return res, flatten_gen[0].unsqueeze(0)
     
 
-class VoiceCraftHF(VoiceCraft, PyTorchModelHubMixin):
-    repo_url="https://github.com/jasonppy/VoiceCraft",
-    tags=["Text-to-Speech", "VoiceCraft"]
+class VoiceCraftHF(VoiceCraft, PyTorchModelHubMixin, repo_url="https://github.com/jasonppy/VoiceCraft", tags=["Text-to-Speech", "VoiceCraft"]):
     def __init__(self, config: dict):
         args = Namespace(**config)
         super().__init__(args)
\ No newline at end of file