def find_closest_word_boundary(alignments, cut_off_sec, margin, cutoff_tolerance=1.0):
    """Choose the word boundary at or just after ``cut_off_sec`` to cut the prompt at.

    The alignment file is read as CSV with one header row. Column 1 of every
    data row is taken as the end time of the aligned unit and column 0 as its
    start time (assumes the forced-aligner layout ``Begin,End,Label,...``).

    Starting from the first row whose end time is ``>= cut_off_sec``, the
    search looks for a row that is followed by a pause of at least ``margin``
    seconds, so that the cut can be placed inside silence rather than inside
    the next word. The look-ahead stops once a row ends ``cutoff_tolerance``
    seconds or more past ``cut_off_sec``. If no such pause is found, the first
    row ending at or after ``cut_off_sec`` is used and the cut is placed
    exactly on its end time.

    Args:
        alignments: Path of the alignment CSV file.
        cut_off_sec: Requested cut-off point in seconds.
        margin: Minimum pause in seconds required after a word for the cut to
            be padded into the pause.
        cutoff_tolerance: How far past ``cut_off_sec`` (in seconds) a row may
            end and still be considered while searching for a pause.

    Returns:
        ``(cutoff_time, cutoff_index)``. ``cutoff_time`` is the cut position
        in seconds: the row's end time plus ``margin / 2`` when a sufficient
        pause follows it, otherwise the bare end time. ``cutoff_index`` is the
        0-based index of that row among the data rows, i.e. the last row kept;
        a prefix that includes it is ``rows[:cutoff_index + 1]``. Both values
        are ``None`` when no row ends at or after ``cut_off_sec``.

    Raises:
        OSError: If the alignment file cannot be opened.
        ValueError: If a start or end field is not a number.
        IndexError: If a non-blank row has fewer than two columns.
    """
    # Function-scope import: the file's top-level import block is not visible
    # from this block, so the dependency is brought in locally.
    import csv

    with open(alignments, "r", newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # skip header; the default tolerates an empty file
        rows = [row for row in reader if row]  # blank lines carry no timing

    fallback_time = None
    fallback_index = None
    for i, row in enumerate(rows):
        end = float(row[1])
        if end < cut_off_sec:
            continue

        if fallback_index is None:
            # First row ending at/after the cut-off: used if no pause is found.
            fallback_time, fallback_index = end, i
        elif end >= cut_off_sec + cutoff_tolerance:
            # Too far past the requested cut-off to be worth the pause.
            break

        # The pause is measured from this row's end to the NEXT row's start;
        # the last row has no successor, so it can only serve as a fallback.
        if i + 1 < len(rows):
            pause = float(rows[i + 1][0]) - end
            if pause >= margin:
                # Cut in the middle of the guaranteed part of the pause.
                return end + margin / 2, i

    return fallback_time, fallback_index
demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio -target_transcript = args.target_transcript -# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec. +# NOTE: according to forced-alignment file demo/temp/mfa_alignments/5895_34622_000026_000002.wav, the word "strength" stop as 3.561 sec, so we use first 3.6 sec as the prompt. this should be different for different audio +cut_off_sec = args.cut_off_sec +margin = args.margin audio_fn = f"{temp_folder}/{filename}.wav" +alignments = f"{temp_folder}/mfa_alignments/{filename}.csv" +cut_off_sec, cut_off_word_idx = find_closest_word_boundary(alignments, cut_off_sec, margin) +target_transcript = " ".join(orig_transcript.split(" ")[:cut_off_word_idx]) + " " + args.target_transcript +# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec. info = torchaudio.info(audio_fn) audio_dur = info.num_frames / info.sample_rate