remove trailing whitespace

This commit is contained in:
pgosar 2024-04-06 16:42:50 -05:00
parent 9ce26becea
commit 9148e6020e
1 changed files with 17 additions and 17 deletions

View File

@ -90,7 +90,7 @@ def load_models(whisper_backend_name, whisper_model_name, alignment_model_name,
if align_model is None: if align_model is None:
raise gr.Error("Align model required for whisperx backend") raise gr.Error("Align model required for whisperx backend")
transcribe_model = WhisperxModel(whisper_model_name, align_model) transcribe_model = WhisperxModel(whisper_model_name, align_model)
voicecraft_name = f"{voicecraft_model_name}.pth" voicecraft_name = f"{voicecraft_model_name}.pth"
ckpt_fn = f"./pretrained_models/{voicecraft_name}" ckpt_fn = f"./pretrained_models/{voicecraft_name}"
encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th" encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
@ -132,7 +132,7 @@ def transcribe(seed, audio_path):
if transcribe_model is None: if transcribe_model is None:
raise gr.Error("Transcription model not loaded") raise gr.Error("Transcription model not loaded")
seed_everything(seed) seed_everything(seed)
segments = transcribe_model.transcribe(audio_path) segments = transcribe_model.transcribe(audio_path)
state = get_transcribe_state(segments) state = get_transcribe_state(segments)
@ -234,7 +234,7 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
if mode != "Edit": if mode != "Edit":
from inference_tts_scale import inference_one_sample from inference_tts_scale import inference_one_sample
if smart_transcript: if smart_transcript:
target_transcript = "" target_transcript = ""
for word in transcribe_state["words_info"]: for word in transcribe_state["words_info"]:
if word["end"] < prompt_end_time: if word["end"] < prompt_end_time:
@ -281,7 +281,7 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
morphed_span = (max(edit_start_time - left_margin, 1 / codec_sr), min(edit_end_time + right_margin, audio_dur)) morphed_span = (max(edit_start_time - left_margin, 1 / codec_sr), min(edit_end_time + right_margin, audio_dur))
mask_interval = [[round(morphed_span[0]*codec_sr), round(morphed_span[1]*codec_sr)]] mask_interval = [[round(morphed_span[0]*codec_sr), round(morphed_span[1]*codec_sr)]]
mask_interval = torch.LongTensor(mask_interval) mask_interval = torch.LongTensor(mask_interval)
_, gen_audio = inference_one_sample(voicecraft_model["model"], _, gen_audio = inference_one_sample(voicecraft_model["model"],
voicecraft_model["ckpt"]["config"], voicecraft_model["ckpt"]["config"],
voicecraft_model["ckpt"]["phn2num"], voicecraft_model["ckpt"]["phn2num"],
@ -300,12 +300,12 @@ def run(seed, left_margin, right_margin, codec_audio_sr, codec_sr, top_k, top_p,
output_audio = get_output_audio(previous_audio_tensors, codec_audio_sr) output_audio = get_output_audio(previous_audio_tensors, codec_audio_sr)
sentence_audio = get_output_audio(audio_tensors, codec_audio_sr) sentence_audio = get_output_audio(audio_tensors, codec_audio_sr)
return output_audio, inference_transcript, sentence_audio, previous_audio_tensors return output_audio, inference_transcript, sentence_audio, previous_audio_tensors
def update_input_audio(audio_path): def update_input_audio(audio_path):
if audio_path is None: if audio_path is None:
return 0, 0, 0 return 0, 0, 0
info = torchaudio.info(audio_path) info = torchaudio.info(audio_path)
max_time = round(info.num_frames / info.sample_rate, 2) max_time = round(info.num_frames / info.sample_rate, 2)
return [ return [
@ -314,7 +314,7 @@ def update_input_audio(audio_path):
gr.Slider(maximum=max_time, value=max_time), gr.Slider(maximum=max_time, value=max_time),
] ]
def change_mode(mode): def change_mode(mode):
tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor tts_mode_controls, edit_mode_controls, edit_word_mode, split_text, long_tts_sentence_editor
return [ return [
@ -416,7 +416,7 @@ demo_words_info = [
def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word): def update_demo(mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word):
if transcript not in all_demo_texts: if transcript not in all_demo_texts:
return transcript, edit_from_word, edit_to_word return transcript, edit_from_word, edit_to_word
replace_half = edit_word_mode == "Replace half" replace_half = edit_word_mode == "Replace half"
change_edit_from_word = edit_from_word == demo_words[2] or edit_from_word == demo_words[3] change_edit_from_word = edit_from_word == demo_words[2] or edit_from_word == demo_words[3]
change_edit_to_word = edit_to_word == demo_words[11] or edit_to_word == demo_words[12] change_edit_to_word = edit_to_word == demo_words[11] or edit_to_word == demo_words[12]
@ -456,7 +456,7 @@ with gr.Blocks() as app:
transcribe_btn = gr.Button(value="Transcribe") transcribe_btn = gr.Button(value="Transcribe")
align_btn = gr.Button(value="Align") align_btn = gr.Button(value="Align")
with gr.Column(scale=3): with gr.Column(scale=3):
with gr.Group(): with gr.Group():
transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"]) transcript = gr.Textbox(label="Text", lines=7, value=demo_text["TTS"]["smart"])
@ -471,7 +471,7 @@ with gr.Blocks() as app:
info="Split text into parts and run TTS for each part.", visible=False) info="Split text into parts and run TTS for each part.", visible=False)
edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace half", edit_word_mode = gr.Radio(label="Edit word mode", choices=["Replace half", "Replace all"], value="Replace half",
info="What to do with first and last word", visible=False) info="What to do with first and last word", visible=False)
with gr.Group() as tts_mode_controls: with gr.Group() as tts_mode_controls:
prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[10], interactive=True) prompt_to_word = gr.Dropdown(label="Last word in prompt", choices=demo_words, value=demo_words[10], interactive=True)
prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.93, step=0.001, value=3.016) prompt_end_time = gr.Slider(label="Prompt end time", minimum=0, maximum=7.93, step=0.001, value=3.016)
@ -517,11 +517,11 @@ with gr.Blocks() as app:
codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change') codec_sr = gr.Number(label="codec_sr", value=50, info='encodec specific, Do not change')
silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change") silence_tokens = gr.Textbox(label="silence tokens", value="[1388,1898,131]", info="encodec specific, do not change")
audio_tensors = gr.State() audio_tensors = gr.State()
transcribe_state = gr.State(value={"words_info": demo_words_info}) transcribe_state = gr.State(value={"words_info": demo_words_info})
mode.change(fn=update_demo, mode.change(fn=update_demo,
inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word], inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word],
outputs=[transcript, edit_from_word, edit_to_word]) outputs=[transcript, edit_from_word, edit_to_word])
@ -531,11 +531,11 @@ with gr.Blocks() as app:
smart_transcript.change(fn=update_demo, smart_transcript.change(fn=update_demo,
inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word], inputs=[mode, smart_transcript, edit_word_mode, transcript, edit_from_word, edit_to_word],
outputs=[transcript, edit_from_word, edit_to_word]) outputs=[transcript, edit_from_word, edit_to_word])
load_models_btn.click(fn=load_models, load_models_btn.click(fn=load_models,
inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, voicecraft_model_choice], inputs=[whisper_backend_choice, whisper_model_choice, align_model_choice, voicecraft_model_choice],
outputs=[models_selector]) outputs=[models_selector])
input_audio.upload(fn=update_input_audio, input_audio.upload(fn=update_input_audio,
inputs=[input_audio], inputs=[input_audio],
outputs=[prompt_end_time, edit_start_time, edit_end_time]) outputs=[prompt_end_time, edit_start_time, edit_end_time])
@ -564,7 +564,7 @@ with gr.Blocks() as app:
split_text, sentence_selector, audio_tensors split_text, sentence_selector, audio_tensors
], ],
outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors]) outputs=[output_audio, inference_transcript, sentence_selector, audio_tensors])
sentence_selector.change(fn=load_sentence, sentence_selector.change(fn=load_sentence,
inputs=[sentence_selector, codec_audio_sr, audio_tensors], inputs=[sentence_selector, codec_audio_sr, audio_tensors],
outputs=[sentence_audio]) outputs=[sentence_audio])
@ -580,7 +580,7 @@ with gr.Blocks() as app:
split_text, sentence_selector, audio_tensors split_text, sentence_selector, audio_tensors
], ],
outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors]) outputs=[output_audio, inference_transcript, sentence_audio, audio_tensors])
prompt_to_word.change(fn=update_bound_word, prompt_to_word.change(fn=update_bound_word,
inputs=[gr.State(False), prompt_to_word, gr.State("Replace all")], inputs=[gr.State(False), prompt_to_word, gr.State("Replace all")],
outputs=[prompt_end_time]) outputs=[prompt_end_time])