mirror of https://github.com/jasonppy/VoiceCraft.git (synced 2025-06-05 21:49:11 +02:00)

Commit: essential gradio app args added, colab notebook fix

README.md (16 changed lines)
@@ -1,9 +1,3 @@
-# VoiceCraft Gradio Colab
-
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Sewlell/VoiceCraft-gradio-colab/blob/master/voicecraft.ipynb)
-
-Made for those who lacked a dedicated GPU and those who wanted [the friendly GUI by @zuev-stepan](https://github.com/zuev-stepan/VoiceCraft-gradio). Potato programmer brain here so all code credits to @jasonppy, @zuev-stepan and others who contributed in their code.
-
 # VoiceCraft: Zero-Shot Speech Editing and Text-to-Speech in the Wild
 
 [Demo](https://jasonppy.github.io/VoiceCraft_web) [Paper](https://jasonppy.github.io/assets/pdfs/VoiceCraft.pdf)
@@ -105,10 +99,6 @@ conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=
 
 # to run ipynb
 conda install -n voicecraft ipykernel --no-deps --force-reinstall
-
-# below is only needed if you want to run gradio_app.py
-sudo apt-get install espeak # NOTE: only required if you want to use gradio_app, which is used by whisperx for forced alignment
-sudo apt-get install libespeak-dev # NOTE: only required if you want to use gradio_app, which is used by whisperx for forced alignment
 ```
 
 If you have encountered version issues when running things, checkout [environment.yml](./environment.yml) for exact matching.
@@ -117,6 +107,11 @@ If you have encountered version issues when running things, checkout [environmen
 Checkout [`inference_speech_editing.ipynb`](./inference_speech_editing.ipynb) and [`inference_tts.ipynb`](./inference_tts.ipynb)
 
 ## Gradio
+### Run in colab
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zuev-stepan/VoiceCraft-gradio/blob/feature/colab-notebook/voicecraft-gradio-colab.ipynb)
+
+### Run locally
 After environment setup install additional dependencies:
 ```bash
 apt-get install -y espeak espeak-data libespeak1 libespeak-dev
@@ -130,7 +125,6 @@ pip install -r gradio_requirements.txt
 Run gradio server from terminal or [`gradio_app.ipynb`](./gradio_app.ipynb):
 ```bash
 python gradio_app.py
-TMP_PATH=/tmp python gradio_app.py # if you want to change tmp folder path
 ```
 It is ready to use on [default url](http://127.0.0.1:7860).
 
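The `TMP_PATH=/tmp python gradio_app.py` one-liner is dropped here because the commit replaces it with proper command-line flags. For reference, the flags added to gradio_app.py further down can be combined like this (a sketch; names and defaults are taken straight from the argparse definitions in this commit):

```bash
# Sketch: the CLI flags this commit adds to gradio_app.py (defaults shown).
python gradio_app.py \
  --demo-path .demo \
  --tmp-path .demo/temp \
  --models-path .pretrained_models \
  --port 7860 \
  --share   # optional: expose a public gradio URL
```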
gradio_app.py (313 changed lines)
@@ -12,12 +12,10 @@ import numpy as np
 import random
 import uuid
 
-os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-os.chdir("/content/VoiceCraft-gradio-colab")
-os.environ['USER'] = 'aaa'
 
+DEMO_PATH = os.getenv("DEMO_PATH", ".demo")
 TMP_PATH = os.getenv("TMP_PATH", "./demo/temp")
+MODELS_PATH = os.getenv("MODELS_PATH", "./pretrained_models")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 whisper_model, align_model, voicecraft_model = None, None, None
 
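The hard-coded colab setup (CUDA pinning, os.chdir, fake USER) is gone; the three path globals now read environment variables with fallbacks. Note the env vars only take effect when gradio_app is imported as a module (e.g., from gradio_app.ipynb); when the file is run as a script, the argparse block added at the bottom of this commit reassigns them. A minimal sketch of the import path (the model directory shown is hypothetical):

```bash
# Sketch: overriding the os.getenv() defaults without editing the file.
# These apply on import; running `python gradio_app.py` goes through argparse instead.
MODELS_PATH=/data/voicecraft_models \
TMP_PATH=/tmp/voicecraft \
python -c "import gradio_app; gradio_app.get_app().launch()"
```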
@@ -94,14 +92,14 @@ def load_models(whisper_backend_name, whisper_model_name, alignment_model_name,
         transcribe_model = WhisperxModel(whisper_model_name, align_model)
 
     voicecraft_name = f"{voicecraft_model_name}.pth"
-    ckpt_fn = f"./pretrained_models/{voicecraft_name}"
-    encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
+    ckpt_fn = f"{MODELS_PATH}/{voicecraft_name}"
+    encodec_fn = f"{MODELS_PATH}/encodec_4cb2048_giga.th"
     if not os.path.exists(ckpt_fn):
         os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true")
-        os.system(f"mv {voicecraft_name}\?download\=true ./pretrained_models/{voicecraft_name}")
+        os.system(f"mv {voicecraft_name}\?download\=true {MODELS_PATH}/{voicecraft_name}")
     if not os.path.exists(encodec_fn):
         os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
-        os.system(f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
+        os.system(f"mv encodec_4cb2048_giga.th {MODELS_PATH}/encodec_4cb2048_giga.th")
 
     ckpt = torch.load(ckpt_fn, map_location="cpu")
     model = voicecraft.VoiceCraft(ckpt["config"])
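load_models() now resolves both checkpoints under MODELS_PATH and still falls back to a wget-and-mv via os.system when a file is missing. If that automatic download misbehaves, the same files can be fetched by hand (a sketch using the URLs from the code above; wget's -O writes straight to the destination and skips the mv step):

```bash
# Sketch: manual fetch of the checkpoints referenced in load_models().
mkdir -p ./pretrained_models
wget -O ./pretrained_models/giga830M.pth \
  "https://huggingface.co/pyp1/VoiceCraft/resolve/main/giga830M.pth?download=true"
wget -O ./pretrained_models/encodec_4cb2048_giga.th \
  "https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th"
```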
@@ -431,146 +429,131 @@ def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_wo
 ]
 
 
-with gr.Blocks() as app:
-    with gr.Row():
-        with gr.Column(scale=2):
-            load_models_btn = gr.Button(value="Load models")
-        with gr.Column(scale=5):
-            with gr.Accordion("Select models", open=False) as models_selector:
-                with gr.Row():
-                    voicecraft_model_choice = gr.Radio(label="VoiceCraft model", value="giga830M",
-                                                       choices=["giga330M", "giga830M", "giga330M_TTSEnhanced"])
-                    whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisper", "whisperX"])
-                    whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
-                                                    choices=[None, "base.en", "small.en", "medium.en", "large"])
-                    align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=[None, "whisperX"])
+def get_app():
+    with gr.Blocks() as app:
+        with gr.Row():
+            with gr.Column(scale=2):
+                load_models_btn = gr.Button(value="Load models")
+            with gr.Column(scale=5):
+                with gr.Accordion("Select models", open=False) as models_selector:
 
-    with gr.Row():
-        with gr.Column(scale=2):
-            input_audio = gr.Audio(value="./demo/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
-            with gr.Group():
-                original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
-                                                 info="Use whisper model to get the transcript. Fix and align it if necessary.")
-                with gr.Accordion("Word start time", open=False):
-                    transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
-                with gr.Accordion("Word end time", open=False):
-                    transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
 
-            transcribe_btn = gr.Button(value="Transcribe")
-            align_btn = gr.Button(value="Align")
 
-        with gr.Column(scale=3):
-            with gr.Group():
-                transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
-                with gr.Row():
-                    smart_transcript = gr.Checkbox(label="Smart transcript", value=True)
-                    with gr.Accordion(label="?", open=False):
-                        info = gr.Markdown(value=smart_transcript_info)
 
-                with gr.Row():
-                    mode = gr.Radio(label="Mode", choices=["TTS", "Edit", "Long TTS"], value="TTS")
-                    split_text = gr.Radio(label="Split text", choices=["Newline", "Sentence"], value="Newline",
-                                          info="Split text into parts and run TTS for each part.", visible=False)
-                    edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace half",
-                                              info="What to do with first and last word", visible=False)
 
-            with gr.Group() as tts_mode_controls:
-                prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[10], interactive=True)
-                prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.93, step=0.001, value=3.016)
 
-            with gr.Group(visible=False) as edit_mode_controls:
-                with gr.Row():
-                    edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[2], interactive=True)
-                    edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[12], interactive=True)
+                    with gr.Row():
+                        voicecraft_model_choice = gr.Radio(label="VoiceCraft model", value="giga830M",
+                                                           choices=["giga330M", "giga830M", "giga330M_TTSEnhanced"])
+                        whisper_backend_choice = gr.Radio(label="Whisper backend", value="whisperX", choices=["whisper", "whisperX"])
+                        whisper_model_choice = gr.Radio(label="Whisper model", value="base.en",
+                                                        choices=[None, "base.en", "small.en", "medium.en", "large"])
+                        align_model_choice = gr.Radio(label="Forced alignment model", value="whisperX", choices=[None, "whisperX"])
 
+        with gr.Row():
+            with gr.Column(scale=2):
+                input_audio = gr.Audio(value=f"{DEMO_PATH}/84_121550_000074_000000.wav", label="Input Audio", type="filepath", interactive=True)
+                with gr.Group():
+                    original_transcript = gr.Textbox(label="Original transcript", lines=5, value=demo_original_transcript,
+                                                     info="Use whisper model to get the transcript. Fix and align it if necessary.")
+                    with gr.Accordion("Word start time", open=False):
+                        transcript_with_start_time = gr.Textbox(label="Start time", lines=5, interactive=False, info="Start time before each word")
+                    with gr.Accordion("Word end time", open=False):
+                        transcript_with_end_time = gr.Textbox(label="End time", lines=5, interactive=False, info="End time after each word")
 
+                transcribe_btn = gr.Button(value="Transcribe")
+                align_btn = gr.Button(value="Align")
 
-                with gr.Row():
-                    edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.93, step=0.001, value=0.46)
-                    edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.93, step=0.001, value=3.808)
+            with gr.Column(scale=3):
+                with gr.Group():
+                    transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
+                    with gr.Row():
+                        smart_transcript = gr.Checkbox(label="Smart transcript", value=True)
+                        with gr.Accordion(label="?", open=False):
+                            info = gr.Markdown(value=smart_transcript_info)
 
-                run_btn = gr.Button(value="Run")
+                    with gr.Row():
+                        mode = gr.Radio(label="Mode", choices=["TTS", "Edit", "Long TTS"], value="TTS")
+                        split_text = gr.Radio(label="Split text", choices=["Newline", "Sentence"], value="Newline",
+                                              info="Split text into parts and run TTS for each part.", visible=False)
+                        edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace half",
+                                                  info="What to do with first and last word", visible=False)
 
-        with gr.Column(scale=2):
-            output_audio = gr.Audio(label="Output Audio")
-            with gr.Accordion("Inference transcript", open=False):
-                inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
-                                                  info="Inference was performed on this transcript.")
-            with gr.Group(visible=False) as long_tts_sentence_editor:
-                sentence_selector = gr.Dropdown(label="Sentence", value=None,
-                                                info="Select sentence you want to regenerate")
-                sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
-                rerun_btn = gr.Button(value="Rerun")
+                with gr.Group() as tts_mode_controls:
+                    prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[10], interactive=True)
+                    prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.93, step=0.001, value=3.016)
 
-    with gr.Row():
-        with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
-            stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=3,
-                                       info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
-            sample_batch_size = gr.Number(label="speech rate", value=4, precision=0,
-                                          info="The higher the number, the faster the output will be. "
-                                               "Under the hood, the model will generate this many samples and choose the shortest one. "
-                                               "For giga330M_TTSEnhanced, 1 or 2 should be fine since the model is trained to do TTS.")
-            seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
-            kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
-                               info="set to 0 to use less VRAM, but with slower inference")
-            left_margin = gr.Number(label="left_margin", value=0.08, info="margin to the left of the editing segment")
-            right_margin = gr.Number(label="right_margin", value=0.08, info="margin to the right of the editing segment")
-            top_p = gr.Number(label="top_p", value=0.9, info="0.9 is a good value, 0.8 is also good")
-            temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
-            top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
-            codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
-            codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
-            silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
+                with gr.Group(visible=False) as edit_mode_controls:
+                    with gr.Row():
+                        edit_from_word = gr.Dropdown(label="First word to edit", choices=demo_words, value=demo_words[2], interactive=True)
+                        edit_to_word = gr.Dropdown(label="Last word to edit", choices=demo_words, value=demo_words[12], interactive=True)
+                    with gr.Row():
+                        edit_start_time = gr.Slider(label="Edit from time", minimum=0, maximum=7.93, step=0.001, value=0.46)
+                        edit_end_time = gr.Slider(label="Edit to time", minimum=0, maximum=7.93, step=0.001, value=3.808)
+                run_btn = gr.Button(value="Run")
+            with gr.Column(scale=2):
+                output_audio = gr.Audio(label="Output Audio")
+                with gr.Accordion("Inference transcript", open=False):
+                    inference_transcript = gr.Textbox(label="Inference transcript", lines=5, interactive=False,
+                                                      info="Inference was performed on this transcript.")
+                with gr.Group(visible=False) as long_tts_sentence_editor:
+                    sentence_selector = gr.Dropdown(label="Sentence", value=None,
+                                                    info="Select sentence you want to regenerate")
+                    sentence_audio = gr.Audio(label="Sentence Audio", scale=2)
+                    rerun_btn = gr.Button(value="Rerun")
 
+        with gr.Row():
+            with gr.Accordion("Generation Parameters - change these if you are unhappy with the generation", open=False):
+                stop_repetition = gr.Radio(label="stop_repetition", choices=[-1, 1, 2, 3, 4], value=3,
+                                           info="if there are long silence in the generated audio, reduce the stop_repetition to 2 or 1. -1 = disabled")
+                sample_batch_size = gr.Number(label="speech rate", value=4, precision=0,
+                                              info="The higher the number, the faster the output will be. "
+                                                   "Under the hood, the model will generate this many samples and choose the shortest one. "
+                                                   "For giga330M_TTSEnhanced, 1 or 2 should be fine since the model is trained to do TTS.")
+                seed = gr.Number(label="seed", value=-1, precision=0, info="random seeds always works :)")
+                kvcache = gr.Radio(label="kvcache", choices=[0, 1], value=1,
+                                   info="set to 0 to use less VRAM, but with slower inference")
+                left_margin = gr.Number(label="left_margin", value=0.08, info="margin to the left of the editing segment")
+                right_margin = gr.Number(label="right_margin", value=0.08, info="margin to the right of the editing segment")
+                top_p = gr.Number(label="top_p", value=0.9, info="0.9 is a good value, 0.8 is also good")
+                temperature = gr.Number(label="temperature", value=1, info="haven't try other values, do not recommend to change")
+                top_k = gr.Number(label="top_k", value=0, info="0 means we don't use topk sampling, because we use topp sampling")
+                codec_audio_sr = gr.Number(label="codec_audio_sr", value=16000, info='encodec specific, Do not change')
+                codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
+                silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
 
 
-    audio_tensors = gr.State()
-    transcribe_state = gr.State(value={"words_info": demo_words_info})
+        audio_tensors = gr.State()
+        transcribe_state = gr.State(value={"words_info": demo_words_info})
 
 
-    mode.change(fn=update_demo,
-                inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word],
-                outputs=[transcript, edit_from_word, edit_to_word])
-    edit_word_mode.change(fn=update_demo,
-                          inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word],
-                          outputs=[transcript, edit_from_word, edit_to_word])
-    smart_transcript.change(fn=update_demo,
-                            inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word],
-                            outputs=[transcript, edit_from_word, edit_to_word])
+        mode.change(fn=update_demo,
+                    inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word],
+                    outputs=[transcript, edit_from_word, edit_to_word])
+        edit_word_mode.change(fn=update_demo,
+                              inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word],
+                              outputs=[transcript, edit_from_word, edit_to_word])
+        smart_transcript.change(fn=update_demo,
+                                inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word],
+                                outputs=[transcript, edit_from_word, edit_to_word])
 
-    load_models_btn.click(fn=load_models,
-                          inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, voicecraft_model_choice],
-                          outputs=[models_selector])
+        load_models_btn.click(fn=load_models,
+                              inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, voicecraft_model_choice],
+                              outputs=[models_selector])
 
-    input_audio.upload(fn=update_input_audio,
-                       inputs=[input_audio],
-                       outputs=[prompt_end_time, edit_start_time, edit_end_time])
-    transcribe_btn.click(fn=transcribe,
-                         inputs=[seed, input_audio],
-                         outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time,
-                                  prompt_to_word, edit_from_word, edit_to_word, transcribe_state])
-    align_btn.click(fn=align,
-                    inputs=[seed, original_transcript, input_audio],
-                    outputs=[transcript_with_start_time, transcript_with_end_time,
-                             prompt_to_word, edit_from_word, edit_to_word, transcribe_state])
+        input_audio.upload(fn=update_input_audio,
+                           inputs=[input_audio],
+                           outputs=[prompt_end_time, edit_start_time, edit_end_time])
+        transcribe_btn.click(fn=transcribe,
+                             inputs=[seed, input_audio],
+                             outputs=[original_transcript, transcript_with_start_time, transcript_with_end_time,
+                                      prompt_to_word, edit_from_word, edit_to_word, transcribe_state])
+        align_btn.click(fn=align,
+                        inputs=[seed, original_transcript, input_audio],
+                        outputs=[transcript_with_start_time, transcript_with_end_time,
+                                 prompt_to_word, edit_from_word, edit_to_word, transcribe_state])
 
-    mode.change(fn=change_mode,
-                inputs=[mode],
-                outputs=[tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor])
+        mode.change(fn=change_mode,
+                    inputs=[mode],
+                    outputs=[tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor])
 
-    run_btn.click(fn=run,
-                  inputs=[
-                      seed, left_margin, right_margin,
-                      codec_audio_sr, codec_sr,
-                      top_k, top_p, temperature,
-                      stop_repetition, sample_batch_size,
-                      kvcache, silence_tokens,
-                      input_audio, transcribe_state, transcript, smart_transcript,
-                      mode, prompt_end_time, edit_start_time, edit_end_time,
-                      split_text, sentence_selector, audio_tensors
-                  ],
-                  outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
-
-    sentence_selector.change(fn=load_sentence,
-                             inputs=[sentence_selector, codec_audio_sr, audio_tensors],
-                             outputs=[sentence_audio])
-    rerun_btn.click(fn=run,
-                    inputs=[
-                          seed, left_margin, right_margin,
-                          codec_audio_sr, codec_sr,
+        run_btn.click(fn=run,
+                      inputs=[
+                          seed, left_margin, right_margin,
+                          codec_audio_sr, codec_sr,
@@ -578,24 +561,58 @@ with gr.Blocks() as app:
                           stop_repetition, sample_batch_size,
                           kvcache, silence_tokens,
                           input_audio, transcribe_state, transcript, smart_transcript,
-                          gr.State(value="Rerun"), prompt_end_time, edit_start_time, edit_end_time,
+                          mode, prompt_end_time, edit_start_time, edit_end_time,
                           split_text, sentence_selector, audio_tensors
                       ],
-                      outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
+                      outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
 
-    prompt_to_word.change(fn=update_bound_word,
-                          inputs=[gr.State(False), prompt_to_word, gr.State("Replace all")],
-                          outputs=[prompt_end_time])
-    edit_from_word.change(fn=update_bound_word,
-                          inputs=[gr.State(True), edit_from_word, edit_word_mode],
-                          outputs=[edit_start_time])
-    edit_to_word.change(fn=update_bound_word,
-                        inputs=[gr.State(False), edit_to_word, edit_word_mode],
-                        outputs=[edit_end_time])
-    edit_word_mode.change(fn=update_bound_words,
-                          inputs=[edit_from_word, edit_to_word, edit_word_mode],
-                          outputs=[edit_start_time, edit_end_time])
+        sentence_selector.change(fn=load_sentence,
+                                 inputs=[sentence_selector, codec_audio_sr, audio_tensors],
+                                 outputs=[sentence_audio])
+        rerun_btn.click(fn=run,
+                        inputs=[
+                            seed, left_margin, right_margin,
+                            codec_audio_sr, codec_sr,
+                            top_k, top_p, temperature,
+                            stop_repetition, sample_batch_size,
+                            kvcache, silence_tokens,
+                            input_audio, transcribe_state, transcript, smart_transcript,
+                            gr.State(value="Rerun"), prompt_end_time, edit_start_time, edit_end_time,
+                            split_text, sentence_selector, audio_tensors
+                        ],
+                        outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
+
+        prompt_to_word.change(fn=update_bound_word,
+                              inputs=[gr.State(False), prompt_to_word, gr.State("Replace all")],
+                              outputs=[prompt_end_time])
+        edit_from_word.change(fn=update_bound_word,
+                              inputs=[gr.State(True), edit_from_word, edit_word_mode],
+                              outputs=[edit_start_time])
+        edit_to_word.change(fn=update_bound_word,
+                            inputs=[gr.State(False), edit_to_word, edit_word_mode],
+                            outputs=[edit_end_time])
+        edit_word_mode.change(fn=update_bound_words,
+                              inputs=[edit_from_word, edit_to_word, edit_word_mode],
+                              outputs=[edit_start_time, edit_end_time])
+    return app
 
 
 if __name__ == "__main__":
-    app.launch(share=True)
+    import argparse
+
+    parser = argparse.ArgumentParser(description="VoiceCraft gradio app.")
+
+    parser.add_argument("--demo-path", default=".demo", help="Path to demo directory")
+    parser.add_argument("--tmp-path", default=".demo/temp", help="Path to tmp directory")
+    parser.add_argument("--models-path", default=".pretrained_models", help="Path to voicecraft models directory")
+    parser.add_argument("--port", default=7860, type=int, help="App port")
+    parser.add_argument("--share", action="store_true", help="Launch with public url")
+
+    os.environ["USER"] = os.getenv("USER", "user")
+    args = parser.parse_args()
+    DEMO_PATH = args.demo_path
+    TMP_PATH = args.tmp_path
+    MODELS_PATH = args.models_path
+
+    app = get_app()
+    app.launch(share=args.share, server_port=args.port)
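Since the UI construction now lives in get_app(), the same Blocks object can also be built programmatically, which is what lets both the __main__ block above and a notebook kernel launch the identical app. A one-liner sketch (server_port is gradio's launch parameter; 7860 matches the parser default):

```bash
# Sketch: building and launching the app without the CLI entry point.
python -c "from gradio_app import get_app; get_app().launch(server_port=7860)"
```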
voicecraft.ipynb

@@ -1,28 +1,10 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "gpuType": "T4",
-      "authorship_tag": "ABX9TyPsqFhtOeQ18CXHnRkWAQSk",
-      "include_colab_link": true
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "view-in-github",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "view-in-github"
       },
       "source": [
         "<a href=\"https://colab.research.google.com/github/Sewlell/VoiceCraft-gradio-colab/blob/master/voicecraft.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@@ -36,11 +18,16 @@
       },
       "outputs": [],
       "source": [
-        "!git clone https://github.com/Sewlell/VoiceCraft-gradio-colab"
+        "!git clone https://github.com/zuev-stepan/VoiceCraft-gradio"
       ]
     },
     {
       "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "-w3USR91XdxY"
+      },
+      "outputs": [],
       "source": [
         "!pip install tensorboard\n",
         "!pip install phonemizer\n",
@@ -55,25 +42,23 @@
         "\n",
         "!pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft\n",
         "\n",
-        "!pip install -r \"/content/VoiceCraft-gradio-colab/gradio_requirements.txt\""
-      ],
-      "metadata": {
-        "id": "-w3USR91XdxY"
-      },
-      "execution_count": null,
-      "outputs": []
+        "!pip install -r \"/content/VoiceCraft-gradio/gradio_requirements.txt\""
+      ]
     },
     {
       "cell_type": "markdown",
-      "source": [
-        "# Let it restarted, it won't let your entire installation be aborted."
-      ],
       "metadata": {
         "id": "jNuzjrtmv2n1"
-      }
+      },
+      "source": [
+        "# Let it restarted, it won't let your entire installation be aborted."
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "AnqGEwZ4NxtJ"
+      },
       "source": [
         "# Note before launching the `gradio_app.py`\n",
         "\n",
@@ -83,45 +68,58 @@
         "\n",
         "Frequency of VRAM spikes no longer exist as well in April 5 Update.\n",
         "* Nevermind, I have observed some weird usage on Colab's GPU Memory Monitor. It can spike up to 13.5GB VRAM even in WhisperX mode. (April 11)"
-      ],
-      "metadata": {
-        "id": "AnqGEwZ4NxtJ"
-      }
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "dE0W76cMN3Si"
+      },
       "source": [
         "Don't make your `prompt end time` too long, 6-9s is fine. Or else it will **either raise up JSON issue or cut off your generated audio**. This one is due to how VoiceCraft worked (so probably unfixable). It will add those text you want to get audio from at the end of the input audio transcript. It was way too much word for application or code to handle as it added up with original transcript. So please keep it short.\n",
         "\n",
         "Your total audio length (`prompt end time` + add-up audio) must not exceed 16 or 17s."
-      ],
-      "metadata": {
-        "id": "dE0W76cMN3Si"
-      }
+      ]
     },
     {
       "cell_type": "markdown",
+      "metadata": {
+        "id": "nnu2cY4t8P6X"
+      },
       "source": [
         "For voice cloning, I suggest you to probably have a monotone input to feed the voice cloning. Of course you can always try input that have tons of tone variety, but I find that as per April 11 Update, it's much more easy to replicate in monotone rather than audio that have laugh, scream, crying inside.\n",
         "\n",
-        "The inference speed is much stable. With sample text, T4 (Free Tier Colab GPU) can do 6-15s on 6s-8s of `prompt end time`.\n",
-        "\n",
-        "I haven't test the Edit mode yet as those are not my focus, but you can try it."
-      ],
-      "metadata": {
-        "id": "nnu2cY4t8P6X"
-      }
+        "The inference speed is much stable. With sample text, T4 (Free Tier Colab GPU) can do 6-15s on 6s-8s of `prompt end time`."
+      ]
     },
     {
       "cell_type": "code",
-      "source": [
-        "!python \"/content/VoiceCraft-gradio-colab/gradio_app.py\""
-      ],
+      "execution_count": null,
       "metadata": {
         "id": "NDt4r4DiXAwG"
       },
-      "execution_count": null,
-      "outputs": []
+      "outputs": [],
+      "source": [
+        "!python /content/VoiceCraft-gradio/gradio_app.py --demo-path=/content/VoiceCraft-gradio/demo --tmp-path=/content/VoiceCraft-gradio/demo/temp --models-path=/content/VoiceCraft-gradio/pretrained_models --share"
+      ]
     }
-  ]
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "authorship_tag": "ABX9TyPsqFhtOeQ18CXHnRkWAQSk",
+      "gpuType": "T4",
+      "include_colab_link": true,
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
 }