README update, gradio_app.ipynb update, debug print removed

2025-06-05 21:49:11 +02:00 · 2024-04-05 04:40:57 +03:00
parent bbe3437b8d
commit 94e9f9bd42
3 changed files with 21 additions and 76 deletions
--- a/README.md
+++ b/README.md
@@ -96,6 +96,11 @@ Checkout [`inference_speech_editing.ipynb`](./inference_speech_editing.ipynb) an
 ## Gradio
 After setting up the environment, install the additional dependencies:
 ```bash
 apt-get install -y espeak espeak-data libespeak1 libespeak-dev
 apt-get install -y festival*
 apt-get install -y build-essential
 apt-get install -y flac libasound2-dev libsndfile1-dev vorbis-tools
 apt-get install -y libxml2-dev libxslt-dev zlib1g-dev
 pip install -r gradio_requirements.txt
 ```
--- a/gradio_app.ipynb
+++ b/gradio_app.ipynb
@@ -8,84 +8,28 @@
    "### Only do the below if you are using docker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "270aa2cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# install OS deps\n",
    "!sudo apt-get update && sudo apt-get install -y \\\n",
    "    git-core \\\n",
    "    ffmpeg \\\n",
    "    espeak-ng"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ba5f452",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Update and setup Conda voicecraft environment\n",
    "!conda update -y -n base -c conda-forge conda\n",
    "!conda create -y -n voicecraft python=3.9.16 && \\\n",
    "    conda init bash"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ef2935c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# install conda and pip packages into the voicecraft conda environment created above\n",
    "!echo -e \"Grab a cup a coffee and a slice of pizza...\\n\\n\"\n",
    "\n",
    "# make sure $HOME and $USER are setup so this will source the conda environment\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \\\n",
    "    pip install torch==2.0.1 && \\\n",
    "    pip install tensorboard==2.16.2 && \\\n",
    "    pip install phonemizer==3.2.1 && \\\n",
    "    pip install torchaudio==2.0.2 && \\\n",
    "    pip install datasets==2.16.0 && \\\n",
    "    pip install torchmetrics==0.11.1\n",
    "\n",
    "# do this one last otherwise you'll get an error about torch compiler missing due to xformer mismatch\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fca57eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# okay setup the conda environment such that jupyter notebook can find the kernel\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    conda install -y -n voicecraft ipykernel --update-deps --force-reinstall\n",
    "\n",
    "# installs the Jupyter kernel into /home/myusername/.local/share/jupyter/kernels/voicecraft\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    python3 -m ipykernel install --user --name=voicecraft"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "961faa43",
   "metadata": {},
   "outputs": [],
   "source": [
    "!source ~/.bashrc && \\\n",
    "    apt-get update && \\\n",
    "    apt-get install -y espeak espeak-data libespeak1 libespeak-dev && \\\n",
    "    apt-get install -y festival* && \\\n",
    "    apt-get install -y build-essential && \\\n",
    "    apt-get install -y flac libasound2-dev libsndfile1-dev vorbis-tools && \\\n",
    "    apt-get install -y libxml2-dev libxslt-dev zlib1g-dev"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "598d75cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -75,9 +75,6 @@ class WhisperxModel:
 def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
    global transcribe_model, align_model, voicecraft_model
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    if alignment_model_name is not None:
        align_model = WhisperxAlignModel()
@@ -178,7 +175,6 @@ def align(seed, transcript, audio_path):
    } for fragment in fragments["fragments"]]
    segments = align_model.align(segments, audio_path)
    state = get_transcribe_state(segments)
    print(state)
    return [
        state["transcript_with_start_time"], state["transcript_with_end_time"],