From 1e0eaeba2becbf7a632f29971c96b1ac380a2af7 Mon Sep 17 00:00:00 2001 From: pgosar Date: Wed, 17 Apr 2024 16:32:50 -0500 Subject: [PATCH 1/8] add files --- inference_demo.py | 1 + speech_editing_demo.py | 1 + 2 files changed, 2 insertions(+) create mode 100644 inference_demo.py create mode 100644 speech_editing_demo.py diff --git a/inference_demo.py b/inference_demo.py new file mode 100644 index 0000000..85e6ff1 --- /dev/null +++ b/inference_demo.py @@ -0,0 +1 @@ +# WIP diff --git a/speech_editing_demo.py b/speech_editing_demo.py new file mode 100644 index 0000000..85e6ff1 --- /dev/null +++ b/speech_editing_demo.py @@ -0,0 +1 @@ +# WIP From 63736f72696f1492beb7e0406c4e7a736a9f2eb1 Mon Sep 17 00:00:00 2001 From: Pranay Gosar Date: Tue, 23 Apr 2024 13:01:44 -0500 Subject: [PATCH 2/8] add TTS --- inference_demo.py | 185 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 184 insertions(+), 1 deletion(-) diff --git a/inference_demo.py b/inference_demo.py index 85e6ff1..bf14294 100644 --- a/inference_demo.py +++ b/inference_demo.py @@ -1 +1,184 @@ -# WIP +""" +This script will allow you to run TTS inference with Voicecraft +Before getting started, be sure to follow the environment setup. +""" + +from inference_tts_scale import inference_one_sample +from models import voicecraft +from data.tokenizer import ( + AudioTokenizer, + TextTokenizer, +) +from IPython.display import display, Audio +import argparse +import random +import numpy as np +import torchaudio +import torch +import os +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["USER"] = "me" # TODO change this to your username + +device = "cuda" if torch.cuda.is_available() else "cpu" + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="VoiceCraft Inference: see the script for more information on the options") + + parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[ + "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], + help="VoiceCraft model to use") + parser.add_argument("--codec_audio_sr", type=int, + default=16000, help="Audio sampling rate for the codec") + parser.add_argument("--codec_sr", type=int, default=50, + help="Sampling rate for the codec") + parser.add_argument("--top_k", type=int, default=0, + help="Top-k sampling value") + parser.add_argument("--top_p", type=float, default=0.9, + help="Top-p sampling value") + parser.add_argument("--temperature", type=float, + default=1.0, help="Temperature for sampling") + parser.add_argument("--silence_tokens", type=int, nargs="*", + default=[1388, 1898, 131], help="Silence token IDs") + parser.add_argument("--kvcache", type=int, default=1, + help="Key-value cache flag (0 or 1)") + parser.add_argument("--stop_repetition", type=int, + default=3, help="Stop repetition for generation") + parser.add_argument("--sample_batch_size", type=int, + default=3, help="Batch size for sampling") + parser.add_argument("--seed", type=int, default=1, + help="Random seed for reproducibility") + parser.add_argument("--output_dir", type=str, default="./generated_tts", + help="directory to save generated audio") + parser.add_argument("--original_audio", type=str, + default="./demo/84_121550_000074_000000.wav", help="location of target audio file") + parser.add_argument("--original_transcript", type=str, + default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", + help="original audio 
transcript") + parser.add_argument("--target_transcript", type=str, + default="Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!", + help="target audio transcript") + parser.add_argument("--cut_off_sec", type=float, default=3.6, + help="cut off point in seconds for input prompt") + args = parser.parse_args() + return args + + +args = parse_arguments() + +voicecraft_name = args.model_name +# hyperparameters for inference +codec_audio_sr = args.codec_audio_sr +codec_sr = args.codec_sr +top_k = args.top_k +top_p = args.top_p # defaults to 0.9 can also try 0.8, but 0.9 seems to work better +temperature = args.temperature +silence_tokens = args.silence_tokens +kvcache = args.kvcache # NOTE if OOM, change this to 0, or try the 330M model + +# NOTE adjust the below three arguments if the generation is not as good +# NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1 +stop_repetition = args.stop_repetition + +# NOTE: if the if there are long silence or unnaturally strecthed words, +# increase sample_batch_size to 4 or higher. What this will do to the model is that the +# model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. +# So if the speech rate of the generated is too fast change it to a smaller number. +sample_batch_size = args.sample_batch_size +seed = args.seed # change seed if you are still unhappy with the result + +# load the model +model = voicecraft.VoiceCraft.from_pretrained( + f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}") +phn2num = model.args.phn2num +config = vars(model.args) +model.to(device) + +encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th" +if not os.path.exists(encodec_fn): + os.system( + f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th") + os.system( + f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th") +# will also put the neural codec model on gpu +audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) + +text_tokenizer = TextTokenizer(backend="espeak") + +# Prepare your audio +# point to the original audio whose speech you want to clone +# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file +orig_audio = args.original_audio +orig_transcript = args.original_transcript + +# move the audio and transcript to temp folder +temp_folder = "./demo/temp" +os.makedirs(temp_folder, exist_ok=True) +os.system(f"cp {orig_audio} {temp_folder}") +filename = os.path.splitext(orig_audio.split("/")[-1])[0] +with open(f"{temp_folder}/{filename}.txt", "w") as f: + f.write(orig_transcript) +# run MFA to get the alignment +align_temp = f"{temp_folder}/mfa_alignments" + +os.system("source ~/.bashrc && \ + conda activate voicecraft && \ + mfa align -v --clean -j 1 --output_format csv {temp_folder} \ + english_us_arpa english_us_arpa {align_temp}" + ) + +# # if the above fails, it could be because the audio is too hard for the alignment model, +# increasing the beam size usually solves the issue +# os.system("source ~/.bashrc && \ +# conda activate voicecraft && \ +# mfa align -v --clean -j 1 --output_format csv {temp_folder} \ +# english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000") + +# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt +cut_off_sec = args.cut_off_sec # NOTE: 
From b8bb2ab592f6146d882f16cd7aab0869aea45889 Mon Sep 17 00:00:00 2001
From: Pranay Gosar
Date: Tue, 23 Apr 2024 15:25:43 -0500
Subject: [PATCH 3/8] add beam size cmd args

---
 inference_demo.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/inference_demo.py b/inference_demo.py
index bf14294..7e438ae 100644
--- a/inference_demo.py
+++ b/inference_demo.py
@@ -50,6 +50,11 @@ def parse_arguments():
                         default=3, help="Batch size for sampling")
     parser.add_argument("--seed", type=int, default=1,
                         help="Random seed for reproducibility")
+    parser.add_argument("--beam_size", type=int, default=10,
+                        help="beam size for MFA alignment")
+    parser.add_argument("--retry_beam_size", type=int, default=40,
+                        help="retry beam size for MFA alignment")
+
     parser.add_argument("--output_dir", type=str, default="./generated_tts",
                         help="directory to save generated audio")
     parser.add_argument("--original_audio", type=str,
@@ -67,7 +72,6 @@ def parse_arguments():
 
 args = parse_arguments()
 
-
 voicecraft_name = args.model_name
 # hyperparameters for inference
 codec_audio_sr = args.codec_audio_sr
@@ -122,19 +126,15 @@ with open(f"{temp_folder}/{filename}.txt", "w") as f:
     f.write(orig_transcript)
 # run MFA to get the alignment
 align_temp = f"{temp_folder}/mfa_alignments"
-
+beam_size = args.beam_size
+retry_beam_size = args.retry_beam_size
 os.system(f"source ~/.bashrc && \
     conda activate voicecraft && \
     mfa align -v --clean -j 1 --output_format csv {temp_folder} \
-    english_us_arpa english_us_arpa {align_temp}"
+    english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}"
          )
-
-# # if the above fails, it could be because the audio is too hard for the alignment model,
+# if the above fails, it could be because the audio is too hard for the alignment model,
 # increasing the beam size usually solves the issue
-# os.system(f"source ~/.bashrc && \
-#    conda activate voicecraft && \
-#    mfa align -v --clean -j 1 --output_format csv {temp_folder} \
-#    english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000")
 
 # take a look at demo/temp/mfa_alignments, decide which part of the audio to use as prompt
 cut_off_sec = args.cut_off_sec # NOTE: according to the forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stops at 3.561 sec, so we use the first 3.6 sec as the prompt. This should be different for different audio
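
Picking a good `--cut_off_sec` depends on where MFA puts word boundaries, so it is worth inspecting the alignment CSV before settling on a value. MFA's CSV output has `Begin,End,Label,Type,Speaker` columns (the alignment file checked in by the last patch of this series shows the exact layout). A small sketch for listing word end times, with the path being an assumption based on this demo's defaults:

```python
import csv

def print_word_end_times(alignment_csv):
    """Print each aligned word and its end time: candidate cut-off points."""
    with open(alignment_csv) as f:
        for row in csv.DictReader(f):
            if row["Type"] == "words":  # skip the phone-level rows
                print(f'{row["Label"]} ends at {float(row["End"]):.2f}s')

print_word_end_times("./demo/temp/mfa_alignments/84_121550_000074_000000.csv")
```
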
From 59877c085e49b61cb441bb0624b3fbd842da2d43 Mon Sep 17 00:00:00 2001
From: Pranay Gosar
Date: Tue, 23 Apr 2024 18:38:09 -0500
Subject: [PATCH 4/8] add speech editing

---
 inference_demo.py      |  16 ++-
 speech_editing_demo.py | 220 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 225 insertions(+), 11 deletions(-)

diff --git a/inference_demo.py b/inference_demo.py
index 7e438ae..7c9f62e 100644
--- a/inference_demo.py
+++ b/inference_demo.py
@@ -9,7 +9,6 @@ from data.tokenizer import (
     AudioTokenizer,
     TextTokenizer,
 )
-from IPython.display import display, Audio
 import argparse
 import random
 import numpy as np
@@ -25,7 +24,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description="VoiceCraft Inference: see the script for more information on the options")
+        description="VoiceCraft TTS Inference: see the script for more information on the options")
 
     parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[
         "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"],
@@ -34,15 +33,15 @@ def parse_arguments():
         default=16000, help="Audio sampling rate for the codec")
     parser.add_argument("--codec_sr", type=int, default=50,
                         help="Sampling rate for the codec")
-    parser.add_argument("--top_k", type=int, default=0,
-                        help="Top-k sampling value")
+    parser.add_argument("--top_k", type=float, default=0,
+                        help="Top-k value")
     parser.add_argument("--top_p", type=float, default=0.9,
-                        help="Top-p sampling value")
+                        help="Top-p value")
     parser.add_argument("--temperature", type=float,
                         default=1.0, help="Temperature for sampling")
     parser.add_argument("--silence_tokens", type=int, nargs="*",
                         default=[1388, 1898, 131], help="Silence token IDs")
-    parser.add_argument("--kvcache", type=int, default=1,
+    parser.add_argument("--kvcache", type=int, default=1, choices=[0, 1],
                         help="Key-value cache flag (0 or 1)")
     parser.add_argument("--stop_repetition", type=int,
                         default=3, help="Stop repetition for generation")
@@ -54,7 +53,6 @@ def parse_arguments():
                         help="beam size for MFA alignment")
     parser.add_argument("--retry_beam_size", type=int, default=40,
                         help="retry beam size for MFA alignment")
-
     parser.add_argument("--output_dir", type=str, default="./generated_tts",
                         help="directory to save generated audio")
     parser.add_argument("--original_audio", type=str,
@@ -147,9 +145,6 @@ audio_dur = info.num_frames / info.sample_rate
 
 assert 
cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}" prompt_end_frame = int(cut_off_sec * info.sample_rate) -# run the model to get the output - - def seed_everything(seed): os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) @@ -162,6 +157,7 @@ def seed_everything(seed): seed_everything(seed) +# inference decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens, "sample_batch_size": sample_batch_size} concated_audio, gen_audio = inference_one_sample(model, argparse.Namespace( diff --git a/speech_editing_demo.py b/speech_editing_demo.py index 85e6ff1..220d342 100644 --- a/speech_editing_demo.py +++ b/speech_editing_demo.py @@ -1 +1,219 @@ -# WIP +""" +This script will allow you to run Speech Editing inference with Voicecraft +Before getting started, be sure to follow the environment setup. +""" + +from inference_speech_editing_scale import inference_one_sample, get_mask_interval +from edit_utils import get_span +from models import voicecraft +from data.tokenizer import ( + AudioTokenizer, + TextTokenizer, +) +import argparse +import random +import numpy as np +import torchaudio +import torch +import os +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +os.environ["USER"] = "me" # TODO change this to your username + +device = "cuda" if torch.cuda.is_available() else "cpu" + + +def parse_arguments(): + parser = argparse.ArgumentParser( + description="VoiceCraft Speech Editing: see the script for more information on the options") + + parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[ + "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], + help="VoiceCraft model to use") + parser.add_argument("--silence_tokens", type=int, nargs="*", + default=[1388, 1898, 131], help="Silence token IDs") + parser.add_argument("--left_margin", type=float, + default=0.08, help="Left margin value.") + parser.add_argument("--right_margin", type=float, + default=0.08, help="Right margin value.") + parser.add_argument("--codec_audio_sr", type=int, + default=16000, help="Codec audio sample rate.") + parser.add_argument("--codec_sr", type=int, default=50, + help="Codec sample rate.") + parser.add_argument("--top_k", type=float, default=0, help="Top k value.") + parser.add_argument("--top_p", type=float, + default=0.8, help="Top p value.") + parser.add_argument("--temperature", type=float, + default=1, help="Temperature value.") + parser.add_argument("--kvcache", type=float, + default=0, help="Kvcache value.") + parser.add_argument("--seed", type=int, default=1, help="Seed value.") + parser.add_argument("--beam_size", type=int, default=10, + help="beam size for MFA alignment") + parser.add_argument("--retry_beam_size", type=int, default=40, + help="retry beam size for MFA alignment") + parser.add_argument("--original_audio", type=str, + default="./demo/84_121550_000074_000000.wav", help="location of audio file") + parser.add_argument("--stop_repetition", type=int, + default=-1, help="Stop repetition for generation") + parser.add_argument("--original_transcript", type=str, + default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", + help="original transcript") + parser.add_argument("--target_transcript", type=str, + default="But when I 
saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,", + help="target transcript") + parser.add_argument("--edit_type", type=str, + default="substitution", + choices=["insertion", "substitution", "deletion"], + help="type of specified edit") + parser.add_argument("--output_dir", type=str, + default="./demo/generated_se", help="output directory") + args = parser.parse_args() + return args + + +args = parse_arguments() + +voicecraft_name = args.model_name + +# hyperparameters for inference +left_margin = args.left_margin +right_margin = args.right_margin +codec_audio_sr = args.codec_audio_sr +codec_sr = args.codec_sr +top_k = args.top_k +top_p = args.top_p +temperature = args.temperature +kvcache = args.kvcache +# NOTE: adjust the below three arguments if the generation is not as good +seed = args.seed # random seed magic +silence_tokens = args.silence_tokens +# if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1 +stop_repetition = args.stop_repetition +# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest + + +def seed_everything(seed): + os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + + +seed_everything(seed) +device = "cuda" if torch.cuda.is_available() else "cpu" +# or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth +model = voicecraft.VoiceCraft.from_pretrained( + f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}") +phn2num = model.args.phn2num +config = vars(model.args) +model.to(device) + +encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th" +if not os.path.exists(encodec_fn): + os.system( + f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th") + os.system( + f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th") +# will also put the neural codec model on gpu +audio_tokenizer = AudioTokenizer(signature=encodec_fn) + +text_tokenizer = TextTokenizer(backend="espeak") + +# point to the original file or record the file +# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file +orig_audio = args.original_audio +orig_transcript = args.original_transcript +# move the audio and transcript to temp folder +temp_folder = "./demo/temp" +os.makedirs(temp_folder, exist_ok=True) +os.system(f"cp {orig_audio} {temp_folder}") +filename = os.path.splitext(orig_audio.split("/")[-1])[0] +with open(f"{temp_folder}/{filename}.txt", "w") as f: + f.write(orig_transcript) +# run MFA to get the alignment +align_temp = f"{temp_folder}/mfa_alignments" +os.makedirs(align_temp, exist_ok=True) +beam_size = args.beam_size +retry_beam_size = args.retry_beam_size + +os.system("source ~/.bashrc && \ + conda activate voicecraft && \ + mfa align -v --clean -j 1 --output_format csv {temp_folder} \ + english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}" + ) +# if it fail, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue +# os.system(f"mfa align -j 1 --clean --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000") +audio_fn = 
f"{temp_folder}/{filename}.wav" +transcript_fn = f"{temp_folder}/{filename}.txt" +align_fn = f"{align_temp}/{filename}.csv" + +# propose what do you want the target modified transcript to be +target_transcript = args.target_transcript +edit_type = args.edit_type + +# if you want to do a second modification on top of the first one, write down the second modification (target_transcript2, type_of_modification2) +# make sure the two modification do not overlap, if they do, you need to combine them into one modification + +# run the script to turn user input to the format that the model can take +orig_span, new_span = get_span(orig_transcript, target_transcript, edit_type) +if orig_span[0] > orig_span[1]: + RuntimeError(f"example {audio_fn} failed") +if orig_span[0] == orig_span[1]: + orig_span_save = [orig_span[0]] +else: + orig_span_save = orig_span +if new_span[0] == new_span[1]: + new_span_save = [new_span[0]] +else: + new_span_save = new_span + +orig_span_save = ",".join([str(item) for item in orig_span_save]) +new_span_save = ",".join([str(item) for item in new_span_save]) + +start, end = get_mask_interval(align_fn, orig_span_save, edit_type) +info = torchaudio.info(audio_fn) +audio_dur = info.num_frames / info.sample_rate +morphed_span = (max(start - left_margin, 1/codec_sr), + min(end + right_margin, audio_dur)) # in seconds + +# span in codec frames +mask_interval = [[round(morphed_span[0]*codec_sr), + round(morphed_span[1]*codec_sr)]] +mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now + +# run the model to get the output + +decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, + 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens} +orig_audio, new_audio = inference_one_sample(model, argparse.Namespace( + **config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config) + +# save segments for comparison +orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu() +# logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}") + +# save the audio +output_dir = args.output_dir +os.makedirs(output_dir, exist_ok=True) + +save_fn_new = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav" + +torchaudio.save(save_fn_new, new_audio, codec_audio_sr) + +save_fn_orig = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav" +if not os.path.isfile(save_fn_orig): + orig_audio, orig_sr = torchaudio.load(audio_fn) + if orig_sr != codec_audio_sr: + orig_audio = torchaudio.transforms.Resample( + orig_sr, codec_audio_sr)(orig_audio) + torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr) + +# # if you get error importing T5 in transformers +# # try +# # pip uninstall Pillow +# # pip install Pillow +# # you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored From 1850da92104c264ea96b84b0cc886c10608127a7 Mon Sep 17 00:00:00 2001 From: Pranay Gosar Date: Tue, 23 Apr 2024 18:55:34 -0500 Subject: [PATCH 5/8] add short form commands --- inference_demo.py | 58 ++++++++++++++++++++++-------------------- speech_editing_demo.py | 45 ++++++++++++++++---------------- 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/inference_demo.py b/inference_demo.py index 7c9f62e..86cd506 100644 --- a/inference_demo.py +++ b/inference_demo.py @@ -26,44 +26,45 @@ def parse_arguments(): parser = 
argparse.ArgumentParser( description="VoiceCraft TTS Inference: see the script for more information on the options") - parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[ + parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[ "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], help="VoiceCraft model to use") - parser.add_argument("--codec_audio_sr", type=int, - default=16000, help="Audio sampling rate for the codec") - parser.add_argument("--codec_sr", type=int, default=50, - help="Sampling rate for the codec") - parser.add_argument("--top_k", type=float, default=0, - help="Top-k value") - parser.add_argument("--top_p", type=float, default=0.9, - help="Top-p value") - parser.add_argument("--temperature", type=float, - default=1.0, help="Temperature for sampling") - parser.add_argument("--silence_tokens", type=int, nargs="*", + parser.add_argument("-st", "--silence_tokens", type=int, nargs="*", default=[1388, 1898, 131], help="Silence token IDs") - parser.add_argument("--kvcache", type=int, default=1, choices=[0, 1], - help="Key-value cache flag (0 or 1)") - parser.add_argument("--stop_repetition", type=int, - default=3, help="Stop repetition for generation") + parser.add_argument("-casr", "--codec_audio_sr", type=int, + default=16000, help="Codec audio sample rate.") + parser.add_argument("-csr", "--codec_sr", type=int, default=50, + help="Codec sample rate.") + + parser.add_argument("-k", "--top_k", type=float, + default=0, help="Top k value.") + parser.add_argument("-p", "--top_p", type=float, + default=0.8, help="Top p value.") + parser.add_argument("-t", "--temperature", type=float, + default=1, help="Temperature value.") + parser.add_argument("-kv", "--kvcache", type=float, choices=[0, 1], + default=0, help="Kvcache value.") + parser.add_argument("-sr", "--stop_repetition", type=int, + default=-1, help="Stop repetition for generation") parser.add_argument("--sample_batch_size", type=int, default=3, help="Batch size for sampling") - parser.add_argument("--seed", type=int, default=1, - help="Random seed for reproducibility") - parser.add_argument("--beam_size", type=int, default=10, + parser.add_argument("-s", "--seed", type=int, + default=1, help="Seed value.") + parser.add_argument("-bs", "--beam_size", type=int, default=10, help="beam size for MFA alignment") - parser.add_argument("--retry_beam_size", type=int, default=40, + parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40, help="retry beam size for MFA alignment") parser.add_argument("--output_dir", type=str, default="./generated_tts", help="directory to save generated audio") - parser.add_argument("--original_audio", type=str, - default="./demo/84_121550_000074_000000.wav", help="location of target audio file") - parser.add_argument("--original_transcript", type=str, + parser.add_argument("-oa", "--original_audio", type=str, + default="./demo/84_121550_000074_000000.wav", help="location of audio file") + parser.add_argument("-ot", "--original_transcript", type=str, default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", - help="original audio transcript") - parser.add_argument("--target_transcript", type=str, - default="Gwynplaine had, besides, for his work and for his feats of strength, I cannot believe that the same model can also do text to speech synthesis too!", - help="target audio transcript") - parser.add_argument("--cut_off_sec", 
type=float, default=3.6, + help="original transcript") + parser.add_argument("-tt", "--target_transcript", type=str, + default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,", + help="target transcript") + parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6, help="cut off point in seconds for input prompt") args = parser.parse_args() return args @@ -145,6 +146,7 @@ audio_dur = info.num_frames / info.sample_rate assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}" prompt_end_frame = int(cut_off_sec * info.sample_rate) + def seed_everything(seed): os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) diff --git a/speech_editing_demo.py b/speech_editing_demo.py index 220d342..99c24f5 100644 --- a/speech_editing_demo.py +++ b/speech_editing_demo.py @@ -27,46 +27,47 @@ def parse_arguments(): parser = argparse.ArgumentParser( description="VoiceCraft Speech Editing: see the script for more information on the options") - parser.add_argument("--model_name", type=str, default="giga330M.pth", choices=[ + parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[ "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], help="VoiceCraft model to use") - parser.add_argument("--silence_tokens", type=int, nargs="*", + parser.add_argument("-st", "--silence_tokens", type=int, nargs="*", default=[1388, 1898, 131], help="Silence token IDs") - parser.add_argument("--left_margin", type=float, + parser.add_argument("-lm", "--left_margin", type=float, default=0.08, help="Left margin value.") - parser.add_argument("--right_margin", type=float, + parser.add_argument("-rm", "--right_margin", type=float, default=0.08, help="Right margin value.") - parser.add_argument("--codec_audio_sr", type=int, + parser.add_argument("-casr", "--codec_audio_sr", type=int, default=16000, help="Codec audio sample rate.") - parser.add_argument("--codec_sr", type=int, default=50, + parser.add_argument("-csr", "--codec_sr", type=int, default=50, help="Codec sample rate.") - parser.add_argument("--top_k", type=float, default=0, help="Top k value.") - parser.add_argument("--top_p", type=float, + parser.add_argument("-k", "--top_k", type=float, + default=0, help="Top k value.") + parser.add_argument("-p", "--top_p", type=float, default=0.8, help="Top p value.") - parser.add_argument("--temperature", type=float, + parser.add_argument("-t", "--temperature", type=float, default=1, help="Temperature value.") - parser.add_argument("--kvcache", type=float, + parser.add_argument("-kv", "--kvcache", type=float, choices=[0, 1], default=0, help="Kvcache value.") - parser.add_argument("--seed", type=int, default=1, help="Seed value.") - parser.add_argument("--beam_size", type=int, default=10, - help="beam size for MFA alignment") - parser.add_argument("--retry_beam_size", type=int, default=40, - help="retry beam size for MFA alignment") - parser.add_argument("--original_audio", type=str, - default="./demo/84_121550_000074_000000.wav", help="location of audio file") - parser.add_argument("--stop_repetition", type=int, + parser.add_argument("-sr", "--stop_repetition", type=int, default=-1, help="Stop repetition for generation") - parser.add_argument("--original_transcript", type=str, + parser.add_argument("-s", "--seed", type=int, default=1, help="Seed value.") + parser.add_argument("-bs", "--beam_size", type=int, default=10, + help="beam size for MFA 
alignment") + parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40, + help="retry beam size for MFA alignment") + parser.add_argument("-oa", "--original_audio", type=str, + default="./demo/84_121550_000074_000000.wav", help="location of audio file") + parser.add_argument("-ot", "--original_transcript", type=str, default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", help="original transcript") - parser.add_argument("--target_transcript", type=str, + parser.add_argument("-tt", "--target_transcript", type=str, default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,", help="target transcript") - parser.add_argument("--edit_type", type=str, + parser.add_argument("-et", "--edit_type", type=str, default="substitution", choices=["insertion", "substitution", "deletion"], help="type of specified edit") - parser.add_argument("--output_dir", type=str, + parser.add_argument("-o", "--output_dir", type=str, default="./demo/generated_se", help="output directory") args = parser.parse_args() return args From 9fb6d948d0d86a624b815288afda92bac5fe839a Mon Sep 17 00:00:00 2001 From: pgosar Date: Tue, 23 Apr 2024 19:07:24 -0500 Subject: [PATCH 6/8] add simple running instructions --- README.md | 19 ++++++++++++++----- inference_demo.py => tts_demo.py | 0 2 files changed, 14 insertions(+), 5 deletions(-) rename inference_demo.py => tts_demo.py (100%) diff --git a/README.md b/README.md index d7c6d86..129c0bf 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,8 @@ There are three ways (besides running Gradio in Colab): 1. More flexible inference beyond Gradio UI in Google Colab. see [quickstart colab](#quickstart-colab) 2. with docker. see [quickstart docker](#quickstart-docker) 3. without docker. see [environment setup](#environment-setup). You can also run gradio locally if you choose this option +4. As a standalone script that you can easily integrate into other projects. +see [quickstart command line](#quickstart-command-line). When you are inside the docker image or you have installed all dependencies, Checkout [`inference_tts.ipynb`](./inference_tts.ipynb). @@ -21,7 +23,7 @@ If you want to do model development such as training/finetuning, I recommend fol ## News :star: 04/22/2024: 330M/830M TTS Enhanced Models are up [here](https://huggingface.co/pyp1), load them through [`gradio_app.py`](./gradio_app.py) or [`inference_tts.ipynb`](./inference_tts.ipynb)! Replicate demo is up, major thanks to [@chenxwh](https://github.com/chenxwh)! -:star: 04/11/2024: VoiceCraft Gradio is now available on HuggingFace Spaces [here](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio)! Major thanks to [@zuev-stepan](https://github.com/zuev-stepan), [@Sewlell](https://github.com/Sewlell), [@pgsoar](https://github.com/pgosar) [@Ph0rk0z](https://github.com/Ph0rk0z). +:star: 04/11/2024: VoiceCraft Gradio is now available on HuggingFace Spaces [here](https://huggingface.co/spaces/pyp1/VoiceCraft_gradio)! Major thanks to [@zuev-stepan](https://github.com/zuev-stepan), [@Sewlell](https://github.com/Sewlell), [@pgsoar](https://github.com/pgosar) [@Ph0rk0z](https://github.com/Ph0rk0z). :star: 04/05/2024: I finetuned giga330M with the TTS objective on gigaspeech and 1/5 of librilight. Weights are [here](https://huggingface.co/pyp1/VoiceCraft/tree/main). 
Make sure maximal prompt + generation length <= 16 seconds (due to our limited compute, we had to drop utterances longer than 16s in training data). Even stronger models forthcoming, stay tuned!
@@ -37,11 +39,9 @@ If you want to do model development such as training/finetuning, I recommend fol
 - [x] Better guidance on training/finetuning
 - [x] Colab notebooks
 - [x] HuggingFace Spaces demo
-- [ ] Command line
+- [x] Command line
 - [ ] Improve efficiency
-
-
 ## QuickStart Colab
 
 :star: To try out speech editing or TTS Inference with VoiceCraft, the simplest way is using Google Colab.
 Instructions to run are on the Colab itself.
 
 1. To try [Speech Editing](https://colab.research.google.com/drive/1FV7EC36dl8UioePY1xXijXTMl7X47kR_?usp=sharing)
 2. To try [TTS Inference](https://colab.research.google.com/drive/1lch_6it5-JpXgAQlUTRRI2z2_rk5K67Z?usp=sharing)
 
+## QuickStart Command Line
+
+:star: To use it as a standalone script, check out tts_demo.py and speech_editing_demo.py.
+Be sure to first [setup your environment](#environment-setup).
+Without arguments, they will run the standard demo arguments used as an example elsewhere
+in this repository. You can use the command line arguments to specify unique input audios,
+target transcripts, and inference hyperparameters. Run the help command for more information:
+`python3 tts_demo.py -h` and `python3 speech-editing_demo.py -h`
+
 ## QuickStart Docker
 :star: To try out TTS inference with VoiceCraft, you can also use docker. Thank [@ubergarm](https://github.com/ubergarm) and [@jayc88](https://github.com/jay-c88) for making this happen.
@@ -197,7 +206,7 @@ cd ./z_scripts
 bash e830M.sh
 ```
 
-It's the same procedure to prepare your own custom dataset. Make sure that if 
+It's the same procedure to prepare your own custom dataset. Make sure that if
 ## Finetuning
 You also need to do step 1-4 as Training, and I recommend to use AdamW for optimization if you finetune a pretrained model for better stability. checkout script `./z_scripts/e830M_ft.sh`. 

diff --git a/inference_demo.py b/tts_demo.py
similarity index 100%
rename from inference_demo.py
rename to tts_demo.py
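
Since the README now points readers at the command line, here is a sketch of a full invocation of the renamed script using the flags it exposes at this point in the series; the values below simply spell out the demo defaults:

```bash
python3 tts_demo.py \
    -m giga330M.pth \
    -oa ./demo/84_121550_000074_000000.wav \
    -ot "But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks," \
    -tt "But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks," \
    -co 3.6 --output_dir ./generated_tts
```
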
From 1a896d21fe2866cc99707140c2b533d0eef2c7ce Mon Sep 17 00:00:00 2001
From: Pranay Gosar
Date: Fri, 3 May 2024 22:16:06 -0500
Subject: [PATCH 7/8] adjust cut off sec and target transcript

---
 tts_demo.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/tts_demo.py b/tts_demo.py
index 86cd506..6f8c347 100644
--- a/tts_demo.py
+++ b/tts_demo.py
@@ -62,10 +62,13 @@ def parse_arguments():
                         default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,",
                         help="original transcript")
     parser.add_argument("-tt", "--target_transcript", type=str,
-                        default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,",
+                        default="object was seen as a mirage in the lake in the distance,",
                         help="target transcript")
     parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6,
                         help="cut off point in seconds for input prompt")
+    parser.add_argument("-ma", "--margin", type=float, default=0.07,
+                        help="lowest margin in seconds between words for input prompt")
+
     args = parser.parse_args()
     return args
@@ -135,11 +138,33 @@ os.system(f"source ~/.bashrc && \
 # if the above fails, it could be because the audio is too hard for the alignment model,
 # increasing the beam size usually solves the issue
 
+def find_closest_word_boundary(alignments, cut_off_sec, margin):
+    with open(alignments, 'r') as file:
+        # skip header
+        next(file)
+        prev_end = 0.0
+        cutoff_time = None
+        cutoff_index = None
+        for i, line in enumerate(file):
+            end = float(line.strip().split(',')[1])
+            if end >= cut_off_sec and end - prev_end >= margin:
+                cutoff_time = end + margin / 2
+                cutoff_index = i
+                break
+
+            prev_end = end
+
+    return cutoff_time, cutoff_index
+
 # take a look at demo/temp/mfa_alignments, decide which part of the audio to use as prompt
-cut_off_sec = args.cut_off_sec # NOTE: according to the forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stops at 3.561 sec, so we use the first 3.6 sec as the prompt. This should be different for different audio
-target_transcript = args.target_transcript
-# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.
+# NOTE: according to the forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stops at 3.561 sec, so we use the first 3.6 sec as the prompt. This should be different for different audio
+cut_off_sec = args.cut_off_sec
+margin = args.margin
 audio_fn = f"{temp_folder}/{filename}.wav"
+alignments = f"{temp_folder}/mfa_alignments/{filename}.csv"
+cut_off_sec, cut_off_word_idx = find_closest_word_boundary(alignments, cut_off_sec, margin)
+target_transcript = " ".join(orig_transcript.split(" ")[:cut_off_word_idx]) + " " + args.target_transcript
+# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec. 
info = torchaudio.info(audio_fn) audio_dur = info.num_frames / info.sample_rate From ef9d65433cb3f5a490fa9f855355579da6a0e955 Mon Sep 17 00:00:00 2001 From: pyp_l40 Date: Sat, 4 May 2024 12:25:37 -0500 Subject: [PATCH 8/8] improve automatic cutoff finding, delete editing script --- README.md | 2 +- .../5895_34622_000026_000002.csv | 106 +++++++++ speech_editing_demo.py | 220 ------------------ tts_demo.py | 73 +++--- 4 files changed, 148 insertions(+), 253 deletions(-) create mode 100644 demo/temp/mfa_alignments/5895_34622_000026_000002.csv delete mode 100644 speech_editing_demo.py diff --git a/README.md b/README.md index 129c0bf..ae7c95b 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Be sure to first [setup your environment](#environment-setup). Without arguments, they will run the standard demo arguments used as an example elsewhere in this repository. You can use the command line arguments to specify unique input audios, target transcripts, and inference hyperparameters. Run the help command for more information: -`python3 tts_demo.py -h` and `python3 speech-editing_demo.py -h` +`python3 tts_demo.py -h` ## QuickStart Docker :star: To try out TTS inference with VoiceCraft, you can also use docker. Thank [@ubergarm](https://github.com/ubergarm) and [@jayc88](https://github.com/jay-c88) for making this happen. diff --git a/demo/temp/mfa_alignments/5895_34622_000026_000002.csv b/demo/temp/mfa_alignments/5895_34622_000026_000002.csv new file mode 100644 index 0000000..81a998d --- /dev/null +++ b/demo/temp/mfa_alignments/5895_34622_000026_000002.csv @@ -0,0 +1,106 @@ +Begin,End,Label,Type,Speaker +0.04,0.58,gwynplaine,words,temp +0.58,0.94,had,words,temp +0.94,1.45,besides,words,temp +1.45,1.62,for,words,temp +1.62,1.86,his,words,temp +1.86,2.16,work,words,temp +2.16,2.31,and,words,temp +2.31,2.49,for,words,temp +2.49,2.71,his,words,temp +2.71,3.03,feats,words,temp +3.03,3.12,of,words,temp +3.12,3.61,strength,words,temp +3.95,4.25,round,words,temp +4.25,4.45,his,words,temp +4.45,4.7,neck,words,temp +4.7,4.81,and,words,temp +4.81,5.04,over,words,temp +5.04,5.22,his,words,temp +5.22,5.83,shoulders,words,temp +6.16,6.31,an,words,temp +6.41,7.15,esclavine,words,temp +7.15,7.29,of,words,temp +7.29,7.7,leather,words,temp +0.04,0.1,G,phones,temp +0.1,0.13,W,phones,temp +0.13,0.22,IH1,phones,temp +0.22,0.3,N,phones,temp +0.3,0.38,P,phones,temp +0.38,0.42,L,phones,temp +0.42,0.53,EY1,phones,temp +0.53,0.58,N,phones,temp +0.58,0.71,HH,phones,temp +0.71,0.86,AE1,phones,temp +0.86,0.94,D,phones,temp +0.94,0.97,B,phones,temp +0.97,1.01,IH0,phones,temp +1.01,1.14,S,phones,temp +1.14,1.34,AY1,phones,temp +1.34,1.4,D,phones,temp +1.4,1.45,Z,phones,temp +1.45,1.52,F,phones,temp +1.52,1.55,AO1,phones,temp +1.55,1.62,R,phones,temp +1.62,1.69,HH,phones,temp +1.69,1.76,IH1,phones,temp +1.76,1.86,Z,phones,temp +1.86,1.95,W,phones,temp +1.95,2.07,ER1,phones,temp +2.07,2.16,K,phones,temp +2.16,2.23,AH0,phones,temp +2.23,2.26,N,phones,temp +2.26,2.31,D,phones,temp +2.31,2.38,F,phones,temp +2.38,2.41,AO1,phones,temp +2.41,2.49,R,phones,temp +2.49,2.55,HH,phones,temp +2.55,2.62,IH1,phones,temp +2.62,2.71,Z,phones,temp +2.71,2.8,F,phones,temp +2.8,2.9,IY1,phones,temp +2.9,2.98,T,phones,temp +2.98,3.03,S,phones,temp +3.03,3.07,AH0,phones,temp +3.07,3.12,V,phones,temp +3.12,3.2,S,phones,temp +3.2,3.26,T,phones,temp +3.26,3.32,R,phones,temp +3.32,3.39,EH1,phones,temp +3.39,3.48,NG,phones,temp +3.48,3.53,K,phones,temp +3.53,3.61,TH,phones,temp +3.95,4.03,R,phones,temp +4.03,4.16,AW1,phones,temp 
+4.16,4.21,N,phones,temp +4.21,4.25,D,phones,temp +4.25,4.29,HH,phones,temp +4.29,4.36,IH1,phones,temp +4.36,4.45,Z,phones,temp +4.45,4.53,N,phones,temp +4.53,4.62,EH1,phones,temp +4.62,4.7,K,phones,temp +4.7,4.74,AH0,phones,temp +4.74,4.77,N,phones,temp +4.77,4.81,D,phones,temp +4.81,4.92,OW1,phones,temp +4.92,4.97,V,phones,temp +4.97,5.04,ER0,phones,temp +5.04,5.11,HH,phones,temp +5.11,5.18,IH1,phones,temp +5.18,5.22,Z,phones,temp +5.22,5.34,SH,phones,temp +5.34,5.47,OW1,phones,temp +5.47,5.51,L,phones,temp +5.51,5.58,D,phones,temp +5.58,5.71,ER0,phones,temp +5.71,5.83,Z,phones,temp +6.16,6.23,AE1,phones,temp +6.23,6.31,N,phones,temp +6.41,7.15,spn,phones,temp +7.15,7.21,AH0,phones,temp +7.21,7.29,V,phones,temp +7.29,7.36,L,phones,temp +7.36,7.44,EH1,phones,temp +7.44,7.49,DH,phones,temp +7.49,7.7,ER0,phones,temp diff --git a/speech_editing_demo.py b/speech_editing_demo.py deleted file mode 100644 index 99c24f5..0000000 --- a/speech_editing_demo.py +++ /dev/null @@ -1,220 +0,0 @@ -""" -This script will allow you to run Speech Editing inference with Voicecraft -Before getting started, be sure to follow the environment setup. -""" - -from inference_speech_editing_scale import inference_one_sample, get_mask_interval -from edit_utils import get_span -from models import voicecraft -from data.tokenizer import ( - AudioTokenizer, - TextTokenizer, -) -import argparse -import random -import numpy as np -import torchaudio -import torch -import os -os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -os.environ["CUDA_VISIBLE_DEVICES"] = "0" -os.environ["USER"] = "me" # TODO change this to your username - -device = "cuda" if torch.cuda.is_available() else "cpu" - - -def parse_arguments(): - parser = argparse.ArgumentParser( - description="VoiceCraft Speech Editing: see the script for more information on the options") - - parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[ - "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], - help="VoiceCraft model to use") - parser.add_argument("-st", "--silence_tokens", type=int, nargs="*", - default=[1388, 1898, 131], help="Silence token IDs") - parser.add_argument("-lm", "--left_margin", type=float, - default=0.08, help="Left margin value.") - parser.add_argument("-rm", "--right_margin", type=float, - default=0.08, help="Right margin value.") - parser.add_argument("-casr", "--codec_audio_sr", type=int, - default=16000, help="Codec audio sample rate.") - parser.add_argument("-csr", "--codec_sr", type=int, default=50, - help="Codec sample rate.") - parser.add_argument("-k", "--top_k", type=float, - default=0, help="Top k value.") - parser.add_argument("-p", "--top_p", type=float, - default=0.8, help="Top p value.") - parser.add_argument("-t", "--temperature", type=float, - default=1, help="Temperature value.") - parser.add_argument("-kv", "--kvcache", type=float, choices=[0, 1], - default=0, help="Kvcache value.") - parser.add_argument("-sr", "--stop_repetition", type=int, - default=-1, help="Stop repetition for generation") - parser.add_argument("-s", "--seed", type=int, default=1, help="Seed value.") - parser.add_argument("-bs", "--beam_size", type=int, default=10, - help="beam size for MFA alignment") - parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40, - help="retry beam size for MFA alignment") - parser.add_argument("-oa", "--original_audio", type=str, - default="./demo/84_121550_000074_000000.wav", help="location of audio file") - parser.add_argument("-ot", "--original_transcript", 
type=str, - default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", - help="original transcript") - parser.add_argument("-tt", "--target_transcript", type=str, - default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,", - help="target transcript") - parser.add_argument("-et", "--edit_type", type=str, - default="substitution", - choices=["insertion", "substitution", "deletion"], - help="type of specified edit") - parser.add_argument("-o", "--output_dir", type=str, - default="./demo/generated_se", help="output directory") - args = parser.parse_args() - return args - - -args = parse_arguments() - -voicecraft_name = args.model_name - -# hyperparameters for inference -left_margin = args.left_margin -right_margin = args.right_margin -codec_audio_sr = args.codec_audio_sr -codec_sr = args.codec_sr -top_k = args.top_k -top_p = args.top_p -temperature = args.temperature -kvcache = args.kvcache -# NOTE: adjust the below three arguments if the generation is not as good -seed = args.seed # random seed magic -silence_tokens = args.silence_tokens -# if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1 -stop_repetition = args.stop_repetition -# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest - - -def seed_everything(seed): - os.environ['PYTHONHASHSEED'] = str(seed) - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.backends.cudnn.benchmark = False - torch.backends.cudnn.deterministic = True - - -seed_everything(seed) -device = "cuda" if torch.cuda.is_available() else "cpu" -# or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth -model = voicecraft.VoiceCraft.from_pretrained( - f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}") -phn2num = model.args.phn2num -config = vars(model.args) -model.to(device) - -encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th" -if not os.path.exists(encodec_fn): - os.system( - f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th") - os.system( - f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th") -# will also put the neural codec model on gpu -audio_tokenizer = AudioTokenizer(signature=encodec_fn) - -text_tokenizer = TextTokenizer(backend="espeak") - -# point to the original file or record the file -# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file -orig_audio = args.original_audio -orig_transcript = args.original_transcript -# move the audio and transcript to temp folder -temp_folder = "./demo/temp" -os.makedirs(temp_folder, exist_ok=True) -os.system(f"cp {orig_audio} {temp_folder}") -filename = os.path.splitext(orig_audio.split("/")[-1])[0] -with open(f"{temp_folder}/{filename}.txt", "w") as f: - f.write(orig_transcript) -# run MFA to get the alignment -align_temp = f"{temp_folder}/mfa_alignments" -os.makedirs(align_temp, exist_ok=True) -beam_size = args.beam_size -retry_beam_size = args.retry_beam_size - -os.system("source ~/.bashrc && \ - conda activate voicecraft && \ - mfa align -v --clean -j 1 --output_format csv {temp_folder} \ - english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}" - ) -# if it fail, it could be because 
the audio is too hard for the alignment model, increasing the beam size usually solves the issue -# os.system(f"mfa align -j 1 --clean --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000") -audio_fn = f"{temp_folder}/{filename}.wav" -transcript_fn = f"{temp_folder}/{filename}.txt" -align_fn = f"{align_temp}/{filename}.csv" - -# propose what do you want the target modified transcript to be -target_transcript = args.target_transcript -edit_type = args.edit_type - -# if you want to do a second modification on top of the first one, write down the second modification (target_transcript2, type_of_modification2) -# make sure the two modification do not overlap, if they do, you need to combine them into one modification - -# run the script to turn user input to the format that the model can take -orig_span, new_span = get_span(orig_transcript, target_transcript, edit_type) -if orig_span[0] > orig_span[1]: - RuntimeError(f"example {audio_fn} failed") -if orig_span[0] == orig_span[1]: - orig_span_save = [orig_span[0]] -else: - orig_span_save = orig_span -if new_span[0] == new_span[1]: - new_span_save = [new_span[0]] -else: - new_span_save = new_span - -orig_span_save = ",".join([str(item) for item in orig_span_save]) -new_span_save = ",".join([str(item) for item in new_span_save]) - -start, end = get_mask_interval(align_fn, orig_span_save, edit_type) -info = torchaudio.info(audio_fn) -audio_dur = info.num_frames / info.sample_rate -morphed_span = (max(start - left_margin, 1/codec_sr), - min(end + right_margin, audio_dur)) # in seconds - -# span in codec frames -mask_interval = [[round(morphed_span[0]*codec_sr), - round(morphed_span[1]*codec_sr)]] -mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now - -# run the model to get the output - -decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, - 'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens} -orig_audio, new_audio = inference_one_sample(model, argparse.Namespace( - **config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config) - -# save segments for comparison -orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu() -# logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}") - -# save the audio -output_dir = args.output_dir -os.makedirs(output_dir, exist_ok=True) - -save_fn_new = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav" - -torchaudio.save(save_fn_new, new_audio, codec_audio_sr) - -save_fn_orig = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav" -if not os.path.isfile(save_fn_orig): - orig_audio, orig_sr = torchaudio.load(audio_fn) - if orig_sr != codec_audio_sr: - orig_audio = torchaudio.transforms.Resample( - orig_sr, codec_audio_sr)(orig_audio) - torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr) - -# # if you get error importing T5 in transformers -# # try -# # pip uninstall Pillow -# # pip install Pillow -# # you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored diff --git a/tts_demo.py b/tts_demo.py index 6f8c347..c1d97ce 100644 --- a/tts_demo.py +++ b/tts_demo.py @@ -15,8 +15,6 @@ import numpy as np import torchaudio import torch import os -os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -os.environ["CUDA_VISIBLE_DEVICES"] = "0" os.environ["USER"] 
= "me" # TODO change this to your username device = "cuda" if torch.cuda.is_available() else "cpu" @@ -26,8 +24,8 @@ def parse_arguments(): parser = argparse.ArgumentParser( description="VoiceCraft TTS Inference: see the script for more information on the options") - parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[ - "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"], + parser.add_argument("-m", "--model_name", type=str, default="giga830M", choices=[ + "giga330M", "giga830M", "giga330M_TTSEnhanced", "giga830M_TTSEnhanced"], help="VoiceCraft model to use") parser.add_argument("-st", "--silence_tokens", type=int, nargs="*", default=[1388, 1898, 131], help="Silence token IDs") @@ -50,24 +48,25 @@ def parse_arguments(): default=3, help="Batch size for sampling") parser.add_argument("-s", "--seed", type=int, default=1, help="Seed value.") - parser.add_argument("-bs", "--beam_size", type=int, default=10, + parser.add_argument("-bs", "--beam_size", type=int, default=50, help="beam size for MFA alignment") - parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40, + parser.add_argument("-rbs", "--retry_beam_size", type=int, default=200, help="retry beam size for MFA alignment") parser.add_argument("--output_dir", type=str, default="./generated_tts", help="directory to save generated audio") parser.add_argument("-oa", "--original_audio", type=str, - default="./demo/84_121550_000074_000000.wav", help="location of audio file") + default="./demo/5895_34622_000026_000002.wav", help="location of audio file") parser.add_argument("-ot", "--original_transcript", type=str, - default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,", + default="Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather.", help="original transcript") parser.add_argument("-tt", "--target_transcript", type=str, - default="object was seen as a mirage in the lake in the distance,", + default="I cannot believe that the same model can also do text to speech synthesis too!", help="target transcript") parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6, help="cut off point in seconds for input prompt") - parser.add_argument("-ma", "--margin", type=float, default=0.07, - help="lowest margin in seconds between words for input prompt") + parser.add_argument("-ma", "--margin", type=float, default=0.04, + help="margin in seconds between the end of the cutoff words and the start of the next word. If the next word is not immediately following the cutoff word, the algorithm is more tolerant to word alignment errors") + parser.add_argument("-cuttol", "--cutoff_tolerance", type=float, default=1, help="tolerance in seconds for the cutoff time, if given cut_off_sec plus the tolerance, we still are not able to find the next word, we will use the best cutoff time found, i.e. 
likely no margin or very small margin between the end of the cutoff word and the start of the next word") args = parser.parse_args() return args @@ -96,6 +95,14 @@ sample_batch_size = args.sample_batch_size seed = args.seed # change seed if you are still unhappy with the result # load the model +if voicecraft_name == "330M": + voicecraft_name = "giga330M" +elif voicecraft_name == "830M": + voicecraft_name = "giga830M" +elif voicecraft_name == "330M_TTSEnhanced": + voicecraft_name = "330M_TTSEnhanced" +elif voicecraft_name == "830M_TTSEnhanced": + voicecraft_name = "830M_TTSEnhanced" model = voicecraft.VoiceCraft.from_pretrained( f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}") phn2num = model.args.phn2num @@ -105,9 +112,7 @@ model.to(device) encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th" if not os.path.exists(encodec_fn): os.system( - f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th") - os.system( - f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th") + f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th -O ./pretrained_models/encodec_4cb2048_giga.th") # will also put the neural codec model on gpu audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) @@ -130,30 +135,34 @@ with open(f"{temp_folder}/{filename}.txt", "w") as f: align_temp = f"{temp_folder}/mfa_alignments" beam_size = args.beam_size retry_beam_size = args.retry_beam_size -os.system("source ~/.bashrc && \ - conda activate voicecraft && \ - mfa align -v --clean -j 1 --output_format csv {temp_folder} \ - english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}" - ) +alignments = f"{temp_folder}/mfa_alignments/{filename}.csv" +if not os.path.isfile(alignments): + os.system(f"mfa align -v --clean -j 1 --output_format csv {temp_folder} \ + english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}") # if the above fails, it could be because the audio is too hard for the alignment model, -# increasing the beam size usually solves the issue +# increasing the beam_size and retry_beam_size usually solves the issue -def find_closest_word_boundary(alignments, cut_off_sec, margin): +def find_closest_word_boundary(alignments, cut_off_sec, margin, cutoff_tolerance = 1): with open(alignments, 'r') as file: # skip header next(file) - prev_end = 0.0 cutoff_time = None cutoff_index = None - for i, line in enumerate(file): + cutoff_time_best = None + cutoff_index_best = None + lines = [l for l in file.readlines()] + for i, line in enumerate(lines): end = float(line.strip().split(',')[1]) - if end >= cut_off_sec and end - prev_end >= margin: - cutoff_time = end + margin / 2 + if end >= cut_off_sec and cutoff_time == None: + cutoff_time = end cutoff_index = i - break - - prev_end = end - + if end >= cut_off_sec and end < cut_off_sec + cutoff_tolerance and float(lines[i+1].strip().split(',')[0]) - end >= margin: + cutoff_time_best = end + margin * 2 / 3 + cutoff_index_best = i + break + if cutoff_time_best != None: + cutoff_time = cutoff_time_best + cutoff_index = cutoff_index_best return cutoff_time, cutoff_index # take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt @@ -161,9 +170,9 @@ def find_closest_word_boundary(alignments, cut_off_sec, margin): cut_off_sec = args.cut_off_sec margin = args.margin audio_fn = f"{temp_folder}/{filename}.wav" -alignments = f"{temp_folder}/mfa_alignments/{filename}.csv" -cut_off_sec, 
cut_off_word_idx = find_closest_word_boundary(alignments, cut_off_sec, margin) -target_transcript = " ".join(orig_transcript.split(" ")[:cut_off_word_idx]) + " " + args.target_transcript + +cut_off_sec, cut_off_word_idx = find_closest_word_boundary(alignments, cut_off_sec, margin, args.cutoff_tolerance) +target_transcript = " ".join(orig_transcript.split(" ")[:cut_off_word_idx+1]) + " " + args.target_transcript # NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec. info = torchaudio.info(audio_fn) audio_dur = info.num_frames / info.sample_rate
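
As a usage sketch of the final cutoff logic, calling `find_closest_word_boundary` from the script above against the alignment CSV added by this patch (all numbers below are read off that file):

```python
cut, idx = find_closest_word_boundary(
    "./demo/temp/mfa_alignments/5895_34622_000026_000002.csv",
    cut_off_sec=3.6, margin=0.04, cutoff_tolerance=1)

# "strength" is the first word whose end time (3.61s) reaches cut_off_sec, and
# the next word "round" starts at 3.95s, so the 0.34s gap clears the margin:
# cut == 3.61 + 0.04 * 2 / 3 (about 3.64s) and idx == 11, which makes the
# prompt transcript the first twelve words of the original, ending in "strength".
print(cut, idx)
```
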