Merge 3d3f32ba7e into 2506954b64

2024-04-04 16:00:49 -04:00
3 changed files with 92 additions and 44 deletions
--- a/README.md
+++ b/README.md
@ -86,10 +86,6 @@ conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=

 # to run ipynb
 conda install -n voicecraft ipykernel --no-deps --force-reinstall
-
-# below is only needed if you want to run gradio_app.py
-sudo apt-get install espeak # NOTE: only required if you want to use gradio_app, which is used by whisperx for forced alignment
-sudo apt-get install libespeak-dev # NOTE: only required if you want to use gradio_app, which is used by whisperx for forced alignment
 ```

 If you have encountered version issues when running things, checkout [environment.yml](./environment.yml) for exact matching.
@ -100,18 +96,12 @@ Checkout [`inference_speech_editing.ipynb`](./inference_speech_editing.ipynb) an
 ## Gradio
 After environment setup install additional dependencies:
 ```bash
-apt-get install -y espeak espeak-data libespeak1 libespeak-dev
-apt-get install -y festival*
-apt-get install -y build-essential
-apt-get install -y flac libasound2-dev libsndfile1-dev vorbis-tools
-apt-get install -y libxml2-dev libxslt-dev zlib1g-dev
 pip install -r gradio_requirements.txt
 ```

 Run gradio server from terminal or [`gradio_app.ipynb`](./gradio_app.ipynb):
 ```bash
 python gradio_app.py
-TMP_PATH=/tmp python gradio_app.py # if you want to change tmp folder path
 ```
 It is ready to use on [default url](http://127.0.0.1:7860).

--- a/gradio_app.ipynb
+++ b/gradio_app.ipynb
@ -11,23 +11,79 @@
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "961faa43",
+   "id": "270aa2cc",
   "metadata": {},
   "outputs": [],
   "source": [
-    "!source ~/.bashrc && \\\n",
-    "    apt-get update && \\\n",
-    "    apt-get install -y espeak espeak-data libespeak1 libespeak-dev && \\\n",
-    "    apt-get install -y festival* && \\\n",
-    "    apt-get install -y build-essential && \\\n",
-    "    apt-get install -y flac libasound2-dev libsndfile1-dev vorbis-tools && \\\n",
-    "    apt-get install -y libxml2-dev libxslt-dev zlib1g-dev"
+    "# install OS deps\n",
+    "!sudo apt-get update && sudo apt-get install -y \\\n",
+    "    git-core \\\n",
+    "    ffmpeg \\\n",
+    "    espeak-ng"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "598d75cf",
+   "id": "8ba5f452",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Update and setup Conda voicecraft environment\n",
+    "!conda update -y -n base -c conda-forge conda\n",
+    "!conda create -y -n voicecraft python=3.9.16 && \\\n",
+    "    conda init bash"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4ef2935c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# install conda and pip packages inside the voicecraft conda environment created above\n",
+    "!echo -e \"Grab a cup a coffee and a slice of pizza...\\n\\n\"\n",
+    "\n",
+    "# make sure $HOME and $USER are set up so that sourcing ~/.bashrc makes the conda environment available\n",
+    "!source ~/.bashrc && \\\n",
+    "    conda activate voicecraft && \\\n",
+    "    conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \\\n",
+    "    pip install torch==2.0.1 && \\\n",
+    "    pip install tensorboard==2.16.2 && \\\n",
+    "    pip install phonemizer==3.2.1 && \\\n",
+    "    pip install torchaudio==2.0.2 && \\\n",
+    "    pip install datasets==2.16.0 && \\\n",
+    "    pip install torchmetrics==0.11.1\n",
+    "\n",
+    "# do this one last; otherwise you'll get an error about the torch compiler missing due to an xformers mismatch\n",
+    "!source ~/.bashrc && \\\n",
+    "    conda activate voicecraft && \\\n",
+    "    pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2fca57eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set up the conda environment so that Jupyter Notebook can find the kernel\n",
+    "!source ~/.bashrc && \\\n",
+    "    conda activate voicecraft && \\\n",
+    "    conda install -y -n voicecraft ipykernel --update-deps --force-reinstall\n",
+    "\n",
+    "# installs the Jupyter kernel into /home/myusername/.local/share/jupyter/kernels/voicecraft\n",
+    "!source ~/.bashrc && \\\n",
+    "    conda activate voicecraft && \\\n",
+    "    python3 -m ipykernel install --user --name=voicecraft"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "961faa43",
   "metadata": {},
   "outputs": [],
   "source": [
--- a/gradio_app.py
+++ b/gradio_app.py
@ -1,6 +1,3 @@
-import os
-# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-# os.environ["CUDA_VISIBLE_DEVICES"] = "0" # for local use
 import gradio as gr
 import torch
 import torchaudio
@ -9,13 +6,14 @@ from data.tokenizer import (
    TextTokenizer,
 )
 from models import voicecraft
+import os
 import io
 import numpy as np
 import random
 import uuid


-TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
+TMP_PATH = "./demo/temp"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 whisper_model, align_model, voicecraft_model = None, None, None

@ -66,7 +64,7 @@ class WhisperModel:
 class WhisperxModel:
    def __init__(self, model_name, align_model: WhisperxAlignModel):
        from whisperx import load_model
-        self.model = load_model(model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None})
+        self.model = load_model(model_name, device, asr_options={"suppress_numerals": True})
        self.align_model = align_model

    def transcribe(self, audio_path):
@ -77,6 +75,9 @@ class WhisperxModel:
 def load_models(whisper_backend_name, whisper_model_name, alignment_model_name, voicecraft_model_name):
    global transcribe_model, align_model, voicecraft_model

+    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
    if alignment_model_name is not None:
        align_model = WhisperxAlignModel()

@ -177,6 +178,7 @@ def align(seed, transcript, audio_path):
    } for fragment in fragments["fragments"]]
    segments = align_model.align(segments, audio_path)
    state = get_transcribe_state(segments)
+    print(state)

    return [
        state["transcript_with_start_time"], state["transcript_with_end_time"],
@ -235,10 +237,10 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
                target_transcript = ""
                for word in transcribe_state["words_info"]:
                    if word["end"] < prompt_end_time:
-                        target_transcript += word["word"] + (" " if word["word"][-1] != " " else "")
+                        target_transcript += word["word"]
                    elif (word["start"] + word["end"]) / 2 < prompt_end_time:
                        # include part of the word it it's big, but adjust prompt_end_time
-                        target_transcript += word["word"] + (" " if word["word"][-1] != " " else "")
+                        target_transcript += word["word"]
                        prompt_end_time = word["end"]
                        break
                    else:
@ -263,13 +265,13 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
                target_transcript = ""
                for word in transcribe_state["words_info"]:
                    if word["start"] < edit_start_time:
-                        target_transcript += word["word"] + (" " if word["word"][-1] != " " else "")
+                        target_transcript += word["word"]
                    else:
                        break
                target_transcript += f" {sentence}"
                for word in transcribe_state["words_info"]:
                    if word["end"] > edit_end_time:
-                        target_transcript += word["word"] + (" " if word["word"][-1] != " " else "")
+                        target_transcript += word["word"]
            else:
                target_transcript = sentence

@ -441,7 +443,7 @@ with gr.Blocks() as app:

    with gr.Row():
        with gr.Column(scale=2):
-            input_audio = gr.Audio(value="./demo/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
+            input_audio = gr.Audio(value="./demo/84_121550_000074_000000.wav", label="Input Audio", type="filepath")
            with gr.Group():
                original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
                                                 info="Use whisper model to get the transcript. Fix and align it if necessary.")
@ -494,22 +496,22 @@ with gr.Blocks() as app:
                rerun_btn = gr.Button(value="Rerun")

    with gr.Row():
-        with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
-            stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=3,
-                                       info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
-            sample_batch_size = gr.Number(label="speech rate", value=4, precision=0,
-                                          info="The higher the number, the faster the output will be. Under the hood, the model will generate this many samples and choose the shortest one")
-            seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
+        with gr.Accordion("VoiceCraft config", open=False):
+            seed = gr.Number(label="seed", value=-1, precision=0)
+            left_margin = gr.Number(label="left_margin", value=0.08)
+            right_margin = gr.Number(label="right_margin", value=0.08)
+            codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000)
+            codec_sr = gr.Number(label="codec_sr", value=50)
+            top_k = gr.Number(label="top_k", value=0)
+            top_p = gr.Number(label="top_p", value=0.8)
+            temperature = gr.Number(label="temperature", value=1)
+            stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3], value=3,
+                                       info="if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1, -1 = disabled")
+            sample_batch_size = gr.Number(label="sample_batch_size", value=4, precision=0,
+                                          info="generate this many samples and choose the shortest one")
            kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
                                info="set to 0 to use less VRAM, but with slower inference")
-            left_margin = gr.Number(label="left_margin", value=0.08, info="margin to the left of the editing segment")
-            right_margin = gr.Number(label="right_margin", value=0.08, info="margin to the right of the editing segment")
-            top_p = gr.Number(label="top_p", value=0.8, info="0.8 is a good value, 0.9 is also good")
-            temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
-            top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
-            codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
-            codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
-            silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
+            silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]")

    
    audio_tensors = gr.State()