essential gradio app args added, colab notebook fix

Stepan Zuev
2024-04-11 16:28:52 +05:00
parent 9344110b0c
commit 6f0aa2db57
3 changed files with 222 additions and 213 deletions

View File

@@ -1,9 +1,3 @@
-# VoiceCraft Gradio Colab
-[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Sewlell/VoiceCraft-gradio-colab/blob/master/voicecraft.ipynb)
-Made for those who lacked a dedicated GPU and those who wanted [the friendly GUI by @zuev-stepan](https://github.com/zuev-stepan/VoiceCraft-gradio). Potato programmer brain here so all code credits to @jasonppy, @zuev-stepan and others who contributed in their code.
 # VoiceCraft: Zero-Shot Speech Editing and Text-to-Speech in the Wild
 [Demo](https://jasonppy.github.io/VoiceCraft_web) [Paper](https://jasonppy.github.io/assets/pdfs/VoiceCraft.pdf)
@@ -105,10 +99,6 @@ conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=
 # to run ipynb
 conda install -n voicecraft ipykernel --no-deps --force-reinstall
-# below is only needed if you want to run gradio_app.py
-sudo apt-get install espeak # NOTE: only required if you want to use gradio_app, which is used by whisperx for forced alignment
-sudo apt-get install libespeak-dev # NOTE: only required if you want to use gradio_app, which is used by whisperx for forced alignment
 ```
 If you have encountered version issues when running things, checkout [environment.yml](./environment.yml) for exact matching.
@@ -117,6 +107,11 @@ If you have encountered version issues when running things, checkout [environmen
 Checkout [`inference_speech_editing.ipynb`](./inference_speech_editing.ipynb) and [`inference_tts.ipynb`](./inference_tts.ipynb)
 ## Gradio
+### Run in colab
+[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zuev-stepan/VoiceCraft-gradio/blob/feature/colab-notebook/voicecraft-gradio-colab.ipynb)
+### Run locally
 After environment setup install additional dependencies:
 ```bash
 apt-get install -y espeak espeak-data libespeak1 libespeak-dev
@@ -130,7 +125,6 @@ pip install -r gradio_requirements.txt
 Run gradio server from terminal or [`gradio_app.ipynb`](./gradio_app.ipynb):
 ```bash
 python gradio_app.py
-TMP_PATH=/tmp python gradio_app.py # if you want to change tmp folder path
 ```
 It is ready to use on [default url](http://127.0.0.1:7860).
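
Since this commit replaces the `TMP_PATH` environment variable with explicit CLI flags, a launch with custom paths might now look like the sketch below (every path here is illustrative, not a default mandated by the repo):

```bash
# Sketch of the CLI added in this commit; all paths are examples.
python gradio_app.py \
    --demo-path ./demo \
    --tmp-path ./demo/temp \
    --models-path ./pretrained_models \
    --port 7860 \
    --share   # optional: expose a public gradio URL
```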

View File

@@ -12,12 +12,10 @@ import numpy as np
 import random
 import uuid
 
-os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-os.chdir("/content/VoiceCraft-gradio-colab")
-os.environ['USER'] = 'aaa'
+DEMO_PATH = os.getenv("DEMO_PATH", ".demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
+MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 whisper_model, align_model, voicecraft_model = None, None, None
@@ -94,14 +92,14 @@ def load_models(whisper_backend_name, whisper_model_name, alignment_model_name,
         transcribe_model = WhisperxModel(whisper_model_name, align_model)
 
     voicecraft_name = f"{voicecraft_model_name}.pth"
-    ckpt_fn = f"./pretrained_models/{voicecraft_name}"
-    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
+    ckpt_fn = f"{MODELS_PATH}/{voicecraft_name}"
+    encodec_fn = f"{MODELS_PATH}/encodec_4cb2048_giga.th"
     if not os.path.exists(ckpt_fn):
         os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true")
-        os.system(f"mv {voicecraft_name}\?download\=true ./pretrained_models/{voicecraft_name}")
+        os.system(f"mv {voicecraft_name}\?download\=true {MODELS_PATH}/{voicecraft_name}")
     if not os.path.exists(encodec_fn):
         os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
-        os.system(f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
+        os.system(f"mv encodec_4cb2048_giga.th {MODELS_PATH}/encodec_4cb2048_giga.th")
 
     ckpt = torch.load(ckpt_fn, map_location="cpu")
     model = voicecraft.VoiceCraft(ckpt["config"])
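
The wget-then-mv pair in this hunk exists because the `?download=true` query string would otherwise end up in the downloaded filename. A shell equivalent that skips the rename by writing straight to the target path with `wget -O` (the model filename below is a hypothetical example, and `MODELS_PATH` mirrors the variable in the code):

```bash
# Hedged sketch: fetch checkpoints directly into MODELS_PATH with wget -O,
# so the "?download=true" suffix never lands in the local filename.
MODELS_PATH=./pretrained_models    # example path
VOICECRAFT_NAME=giga830M.pth       # hypothetical model name
wget "https://huggingface.co/pyp1/VoiceCraft/resolve/main/${VOICECRAFT_NAME}?download=true" \
     -O "${MODELS_PATH}/${VOICECRAFT_NAME}"
wget "https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th" \
     -O "${MODELS_PATH}/encodec_4cb2048_giga.th"
```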
@@ -431,6 +429,7 @@ def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_wo
     ]
 
+def get_app():
     with gr.Blocks() as app:
         with gr.Row():
             with gr.Column(scale=2):
@@ -447,7 +446,7 @@ with gr.Blocks() as app:
     with gr.Row():
         with gr.Column(scale=2):
-            input_audio = gr.Audio(value="./demo/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
+            input_audio = gr.Audio(value=f"{DEMO_PATH}/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
         with gr.Group():
             original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
                                              info="Use whisper model to get the transcript. Fix and align it if necessary.")
@@ -595,7 +594,25 @@ with gr.Blocks() as app:
     edit_word_mode.change(fn=update_bound_words,
                           inputs=[edit_from_word, edit_to_word, edit_word_mode],
                           outputs=[edit_start_time, edit_end_time])
+    return app
 
 if __name__ == "__main__":
-    app.launch(share=True)
+    import argparse
+
+    parser = argparse.ArgumentParser(description="VoiceCraft gradio app.")
+    parser.add_argument("--demo-path", default=".demo", help="Path to demo directory")
+    parser.add_argument("--tmp-path", default=".demo/temp", help="Path to tmp directory")
+    parser.add_argument("--models-path", default=".pretrained_models", help="Path to voicecraft models directory")
+    parser.add_argument("--port", default=7860, type=int, help="App port")
+    parser.add_argument("--share", action="store_true", help="Launch with public url")
+    os.environ["USER"] = os.getenv("USER", "user")
+
+    args = parser.parse_args()
+    DEMO_PATH = args.demo_path
+    TMP_PATH = args.tmp_path
+    MODELS_PATH = args.models_path
+
+    app = get_app()
+    app.launch(share=args.share, server_port=args.port)
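
One subtlety worth noting: the `os.getenv` defaults at the top of the file still apply when the module is imported (e.g. launched from `gradio_app.ipynb`), because the argparse overrides above only execute under `__main__`. A hedged sketch of both launch styles (paths and port are illustrative):

```bash
# Script launch: the argparse flags win, overwriting the getenv defaults.
python gradio_app.py --models-path ./pretrained_models --port 7861

# Import-style launch (e.g. via gradio_app.ipynb): only the environment
# variables apply, since the __main__ block never runs on import.
MODELS_PATH=./pretrained_models TMP_PATH=/tmp jupyter notebook gradio_app.ipynb
```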

View File

@@ -1,28 +1,10 @@
 {
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
-  "colab": {
-   "provenance": [],
-   "gpuType": "T4",
-   "authorship_tag": "ABX9TyPsqFhtOeQ18CXHnRkWAQSk",
-   "include_colab_link": true
-  },
-  "kernelspec": {
-   "name": "python3",
-   "display_name": "Python 3"
-  },
-  "language_info": {
-   "name": "python"
-  },
-  "accelerator": "GPU"
- },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
-   "id": "view-in-github",
-   "colab_type": "text"
+   "colab_type": "text",
+   "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/Sewlell/VoiceCraft-gradio-colab/blob/master/voicecraft.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
@@ -36,11 +18,16 @@
   },
   "outputs": [],
   "source": [
-   "!git clone https://github.com/Sewlell/VoiceCraft-gradio-colab"
+   "!git clone https://github.com/zuev-stepan/VoiceCraft-gradio"
   ]
  },
  {
   "cell_type": "code",
+  "execution_count": null,
+  "metadata": {
+   "id": "-w3USR91XdxY"
+  },
+  "outputs": [],
   "source": [
    "!pip install tensorboard\n",
    "!pip install phonemizer\n",
@@ -55,25 +42,23 @@
    "\n",
    "!pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft\n",
    "\n",
-   "!pip install -r \"/content/VoiceCraft-gradio-colab/gradio_requirements.txt\""
-  ],
-  "metadata": {
-   "id": "-w3USR91XdxY"
-  },
-  "execution_count": null,
-  "outputs": []
+   "!pip install -r \"/content/VoiceCraft-gradio/gradio_requirements.txt\""
+  ]
  },
  {
   "cell_type": "markdown",
-  "source": [
-   "# Let it restart; this won't abort your installation."
-  ],
   "metadata": {
    "id": "jNuzjrtmv2n1"
-  }
+  },
+  "source": [
+   "# Let it restart; this won't abort your installation."
+  ]
  },
  {
   "cell_type": "markdown",
+  "metadata": {
+   "id": "AnqGEwZ4NxtJ"
+  },
   "source": [
    "# Notes before launching `gradio_app.py`\n",
    "\n",
@@ -83,45 +68,58 @@
    "\n",
    "VRAM spikes no longer occur as of the April 5 update.\n",
    "* Never mind: I have observed some odd readings on Colab's GPU memory monitor. Usage can spike up to 13.5GB of VRAM even in WhisperX mode. (April 11)"
-  ],
-  "metadata": {
-   "id": "AnqGEwZ4NxtJ"
-  }
+  ]
  },
  {
   "cell_type": "markdown",
+  "metadata": {
+   "id": "dE0W76cMN3Si"
+  },
   "source": [
    "Don't make your `prompt end time` too long; 6-9s is fine. Otherwise it will **either raise a JSON error or cut off your generated audio**. This is due to how VoiceCraft works (so it is probably unfixable): the text you want synthesized is appended to the end of the input audio's transcript, and together they become too much text for the application to handle. So please keep it short.\n",
    "\n",
    "Your total audio length (`prompt end time` + added audio) must not exceed 16-17s."
-  ],
-  "metadata": {
-   "id": "dE0W76cMN3Si"
-  }
+  ]
  },
  {
   "cell_type": "markdown",
+  "metadata": {
+   "id": "nnu2cY4t8P6X"
+  },
   "source": [
    "For voice cloning, I suggest feeding it a monotone input. You can always try input with lots of tonal variety, but as of the April 11 update I find monotone audio much easier to replicate than audio containing laughing, screaming, or crying.\n",
    "\n",
-   "Inference speed is much more stable. With the sample text, a T4 (free-tier Colab GPU) takes 6-15s for a `prompt end time` of 6-8s.\n",
-   "\n",
-   "I haven't test the Edit mode yet as those are not my focus, but you can try it."
-  ],
-  "metadata": {
-   "id": "nnu2cY4t8P6X"
-  }
+   "Inference speed is much more stable. With the sample text, a T4 (free-tier Colab GPU) takes 6-15s for a `prompt end time` of 6-8s."
+  ]
  },
  {
   "cell_type": "code",
-  "source": [
-   "!python \"/content/VoiceCraft-gradio-colab/gradio_app.py\""
-  ],
+  "execution_count": null,
   "metadata": {
    "id": "NDt4r4DiXAwG"
   },
-  "execution_count": null,
-  "outputs": []
+  "outputs": [],
+  "source": [
+   "!python /content/VoiceCraft-gradio/gradio_app.py --demo-path=/content/VoiceCraft-gradio/demo --tmp-path=/content/VoiceCraft-gradio/demo/temp --models-path=/content/VoiceCraft-gradio/pretrained_models --share"
+  ]
  }
- ]
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "authorship_tag": "ABX9TyPsqFhtOeQ18CXHnRkWAQSk",
+   "gpuType": "T4",
+   "include_colab_link": true,
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }