improve automatic cutoff finding, delete editing script

pyp_l40 2024-05-04 12:25:37 -05:00
parent 1a896d21fe
commit ef9d65433c
4 changed files with 148 additions and 253 deletions

README.md

@@ -57,7 +57,7 @@ Be sure to first [setup your environment](#environment-setup).
 Without arguments, they will run the standard demo arguments used as an example elsewhere
 in this repository. You can use the command line arguments to specify unique input audios,
 target transcripts, and inference hyperparameters. Run the help command for more information:
-`python3 tts_demo.py -h` and `python3 speech_editing_demo.py -h`
+`python3 tts_demo.py -h`
 ## QuickStart Docker
 :star: To try out TTS inference with VoiceCraft, you can also use docker. Thanks to [@ubergarm](https://github.com/ubergarm) and [@jayc88](https://github.com/jay-c88) for making this happen.
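For reference, a full invocation of the updated demo might look like the sketch below; every flag value shown is simply the new default introduced by this commit (see the tts_demo.py diff further down), spelled out here for illustration:

python3 tts_demo.py -m giga830M \
    -oa ./demo/5895_34622_000026_000002.wav \
    -ot 'Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather.' \
    -tt 'I cannot believe that the same model can also do text to speech synthesis too!' \
    -co 3.6 -ma 0.04 -cuttol 1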

demo/temp/mfa_alignments/5895_34622_000026_000002.csv

@@ -0,0 +1,106 @@
Begin,End,Label,Type,Speaker
0.04,0.58,gwynplaine,words,temp
0.58,0.94,had,words,temp
0.94,1.45,besides,words,temp
1.45,1.62,for,words,temp
1.62,1.86,his,words,temp
1.86,2.16,work,words,temp
2.16,2.31,and,words,temp
2.31,2.49,for,words,temp
2.49,2.71,his,words,temp
2.71,3.03,feats,words,temp
3.03,3.12,of,words,temp
3.12,3.61,strength,words,temp
3.95,4.25,round,words,temp
4.25,4.45,his,words,temp
4.45,4.7,neck,words,temp
4.7,4.81,and,words,temp
4.81,5.04,over,words,temp
5.04,5.22,his,words,temp
5.22,5.83,shoulders,words,temp
6.16,6.31,an,words,temp
6.41,7.15,esclavine,words,temp
7.15,7.29,of,words,temp
7.29,7.7,leather,words,temp
0.04,0.1,G,phones,temp
0.1,0.13,W,phones,temp
0.13,0.22,IH1,phones,temp
0.22,0.3,N,phones,temp
0.3,0.38,P,phones,temp
0.38,0.42,L,phones,temp
0.42,0.53,EY1,phones,temp
0.53,0.58,N,phones,temp
0.58,0.71,HH,phones,temp
0.71,0.86,AE1,phones,temp
0.86,0.94,D,phones,temp
0.94,0.97,B,phones,temp
0.97,1.01,IH0,phones,temp
1.01,1.14,S,phones,temp
1.14,1.34,AY1,phones,temp
1.34,1.4,D,phones,temp
1.4,1.45,Z,phones,temp
1.45,1.52,F,phones,temp
1.52,1.55,AO1,phones,temp
1.55,1.62,R,phones,temp
1.62,1.69,HH,phones,temp
1.69,1.76,IH1,phones,temp
1.76,1.86,Z,phones,temp
1.86,1.95,W,phones,temp
1.95,2.07,ER1,phones,temp
2.07,2.16,K,phones,temp
2.16,2.23,AH0,phones,temp
2.23,2.26,N,phones,temp
2.26,2.31,D,phones,temp
2.31,2.38,F,phones,temp
2.38,2.41,AO1,phones,temp
2.41,2.49,R,phones,temp
2.49,2.55,HH,phones,temp
2.55,2.62,IH1,phones,temp
2.62,2.71,Z,phones,temp
2.71,2.8,F,phones,temp
2.8,2.9,IY1,phones,temp
2.9,2.98,T,phones,temp
2.98,3.03,S,phones,temp
3.03,3.07,AH0,phones,temp
3.07,3.12,V,phones,temp
3.12,3.2,S,phones,temp
3.2,3.26,T,phones,temp
3.26,3.32,R,phones,temp
3.32,3.39,EH1,phones,temp
3.39,3.48,NG,phones,temp
3.48,3.53,K,phones,temp
3.53,3.61,TH,phones,temp
3.95,4.03,R,phones,temp
4.03,4.16,AW1,phones,temp
4.16,4.21,N,phones,temp
4.21,4.25,D,phones,temp
4.25,4.29,HH,phones,temp
4.29,4.36,IH1,phones,temp
4.36,4.45,Z,phones,temp
4.45,4.53,N,phones,temp
4.53,4.62,EH1,phones,temp
4.62,4.7,K,phones,temp
4.7,4.74,AH0,phones,temp
4.74,4.77,N,phones,temp
4.77,4.81,D,phones,temp
4.81,4.92,OW1,phones,temp
4.92,4.97,V,phones,temp
4.97,5.04,ER0,phones,temp
5.04,5.11,HH,phones,temp
5.11,5.18,IH1,phones,temp
5.18,5.22,Z,phones,temp
5.22,5.34,SH,phones,temp
5.34,5.47,OW1,phones,temp
5.47,5.51,L,phones,temp
5.51,5.58,D,phones,temp
5.58,5.71,ER0,phones,temp
5.71,5.83,Z,phones,temp
6.16,6.23,AE1,phones,temp
6.23,6.31,N,phones,temp
6.41,7.15,spn,phones,temp
7.15,7.21,AH0,phones,temp
7.21,7.29,V,phones,temp
7.29,7.36,L,phones,temp
7.36,7.44,EH1,phones,temp
7.44,7.49,DH,phones,temp
7.49,7.7,ER0,phones,temp
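The file above is the MFA alignment for the new demo sample: word rows first, then ARPAbet phone rows, with Begin/End times in seconds. A minimal sketch of reading such a file in Python, keeping only the word rows (the helper name load_word_rows is illustrative, not part of the repo):

import csv

def load_word_rows(path):
    # keep (begin, end, label) for rows whose Type column is "words"
    rows = []
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            if row["Type"] == "words":
                rows.append((float(row["Begin"]), float(row["End"]), row["Label"]))
    return rows

# e.g. the first entry here would be (0.04, 0.58, 'gwynplaine')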

speech_editing_demo.py

@@ -1,220 +0,0 @@
"""
This script will allow you to run Speech Editing inference with Voicecraft
Before getting started, be sure to follow the environment setup.
"""
from inference_speech_editing_scale import inference_one_sample, get_mask_interval
from edit_utils import get_span
from models import voicecraft
from data.tokenizer import (
AudioTokenizer,
TextTokenizer,
)
import argparse
import random
import numpy as np
import torchaudio
import torch
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["USER"] = "me" # TODO change this to your username
device = "cuda" if torch.cuda.is_available() else "cpu"
def parse_arguments():
parser = argparse.ArgumentParser(
description="VoiceCraft Speech Editing: see the script for more information on the options")
parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[
"giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"],
help="VoiceCraft model to use")
parser.add_argument("-st", "--silence_tokens", type=int, nargs="*",
default=[1388, 1898, 131], help="Silence token IDs")
parser.add_argument("-lm", "--left_margin", type=float,
default=0.08, help="Left margin value.")
parser.add_argument("-rm", "--right_margin", type=float,
default=0.08, help="Right margin value.")
parser.add_argument("-casr", "--codec_audio_sr", type=int,
default=16000, help="Codec audio sample rate.")
parser.add_argument("-csr", "--codec_sr", type=int, default=50,
help="Codec sample rate.")
parser.add_argument("-k", "--top_k", type=float,
default=0, help="Top k value.")
parser.add_argument("-p", "--top_p", type=float,
default=0.8, help="Top p value.")
parser.add_argument("-t", "--temperature", type=float,
default=1, help="Temperature value.")
parser.add_argument("-kv", "--kvcache", type=float, choices=[0, 1],
default=0, help="Kvcache value.")
parser.add_argument("-sr", "--stop_repetition", type=int,
default=-1, help="Stop repetition for generation")
parser.add_argument("-s", "--seed", type=int, default=1, help="Seed value.")
parser.add_argument("-bs", "--beam_size", type=int, default=10,
help="beam size for MFA alignment")
parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40,
help="retry beam size for MFA alignment")
parser.add_argument("-oa", "--original_audio", type=str,
default="./demo/84_121550_000074_000000.wav", help="location of audio file")
parser.add_argument("-ot", "--original_transcript", type=str,
default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,",
help="original transcript")
parser.add_argument("-tt", "--target_transcript", type=str,
default="But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,",
help="target transcript")
parser.add_argument("-et", "--edit_type", type=str,
default="substitution",
choices=["insertion", "substitution", "deletion"],
help="type of specified edit")
parser.add_argument("-o", "--output_dir", type=str,
default="./demo/generated_se", help="output directory")
args = parser.parse_args()
return args
args = parse_arguments()
voicecraft_name = args.model_name
# hyperparameters for inference
left_margin = args.left_margin
right_margin = args.right_margin
codec_audio_sr = args.codec_audio_sr
codec_sr = args.codec_sr
top_k = args.top_k
top_p = args.top_p
temperature = args.temperature
kvcache = args.kvcache
# NOTE: adjust the below three arguments if the generation is not as good
seed = args.seed # random seed magic
silence_tokens = args.silence_tokens
# if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1
stop_repetition = args.stop_repetition
# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest
def seed_everything(seed):
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
seed_everything(seed)
device = "cuda" if torch.cuda.is_available() else "cpu"
# or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth
model = voicecraft.VoiceCraft.from_pretrained(
f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}")
phn2num = model.args.phn2num
config = vars(model.args)
model.to(device)
encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
if not os.path.exists(encodec_fn):
os.system(
f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
os.system(
f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
# will also put the neural codec model on gpu
audio_tokenizer = AudioTokenizer(signature=encodec_fn)
text_tokenizer = TextTokenizer(backend="espeak")
# point to the original file or record the file
# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file
orig_audio = args.original_audio
orig_transcript = args.original_transcript
# move the audio and transcript to temp folder
temp_folder = "./demo/temp"
os.makedirs(temp_folder, exist_ok=True)
os.system(f"cp {orig_audio} {temp_folder}")
filename = os.path.splitext(orig_audio.split("/")[-1])[0]
with open(f"{temp_folder}/{filename}.txt", "w") as f:
f.write(orig_transcript)
# run MFA to get the alignment
align_temp = f"{temp_folder}/mfa_alignments"
os.makedirs(align_temp, exist_ok=True)
beam_size = args.beam_size
retry_beam_size = args.retry_beam_size
os.system("source ~/.bashrc && \
conda activate voicecraft && \
mfa align -v --clean -j 1 --output_format csv {temp_folder} \
english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}"
)
# if it fail, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue
# os.system(f"mfa align -j 1 --clean --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000")
audio_fn = f"{temp_folder}/{filename}.wav"
transcript_fn = f"{temp_folder}/{filename}.txt"
align_fn = f"{align_temp}/{filename}.csv"
# propose what do you want the target modified transcript to be
target_transcript = args.target_transcript
edit_type = args.edit_type
# if you want to do a second modification on top of the first one, write down the second modification (target_transcript2, type_of_modification2)
# make sure the two modification do not overlap, if they do, you need to combine them into one modification
# run the script to turn user input to the format that the model can take
orig_span, new_span = get_span(orig_transcript, target_transcript, edit_type)
if orig_span[0] > orig_span[1]:
RuntimeError(f"example {audio_fn} failed")
if orig_span[0] == orig_span[1]:
orig_span_save = [orig_span[0]]
else:
orig_span_save = orig_span
if new_span[0] == new_span[1]:
new_span_save = [new_span[0]]
else:
new_span_save = new_span
orig_span_save = ",".join([str(item) for item in orig_span_save])
new_span_save = ",".join([str(item) for item in new_span_save])
start, end = get_mask_interval(align_fn, orig_span_save, edit_type)
info = torchaudio.info(audio_fn)
audio_dur = info.num_frames / info.sample_rate
morphed_span = (max(start - left_margin, 1/codec_sr),
min(end + right_margin, audio_dur)) # in seconds
# span in codec frames
mask_interval = [[round(morphed_span[0]*codec_sr),
round(morphed_span[1]*codec_sr)]]
mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now
# run the model to get the output
decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition,
'kvcache': kvcache, "codec_audio_sr": codec_audio_sr, "codec_sr": codec_sr, "silence_tokens": silence_tokens}
orig_audio, new_audio = inference_one_sample(model, argparse.Namespace(
**config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)
# save segments for comparison
orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()
# logging.info(f"length of the resynthesize orig audio: {orig_audio.shape}")
# save the audio
output_dir = args.output_dir
os.makedirs(output_dir, exist_ok=True)
save_fn_new = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav"
torchaudio.save(save_fn_new, new_audio, codec_audio_sr)
save_fn_orig = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav"
if not os.path.isfile(save_fn_orig):
orig_audio, orig_sr = torchaudio.load(audio_fn)
if orig_sr != codec_audio_sr:
orig_audio = torchaudio.transforms.Resample(
orig_sr, codec_audio_sr)(orig_audio)
torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)
# # if you get error importing T5 in transformers
# # try
# # pip uninstall Pillow
# # pip install Pillow
# # you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored
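For context on the core step of this now-deleted script: the edit span returned by get_mask_interval is padded by the margins and converted from seconds to codec frames at codec_sr = 50 Hz. A small worked sketch using the script's defaults; the span and duration values are hypothetical:

codec_sr = 50                       # codec frame rate in Hz (script default)
left_margin = right_margin = 0.08   # padding in seconds (script defaults)
start, end = 1.20, 2.35             # hypothetical edit span from get_mask_interval, in seconds
audio_dur = 6.5                     # hypothetical audio duration in seconds

morphed_span = (max(start - left_margin, 1 / codec_sr),
                min(end + right_margin, audio_dur))          # (1.12, 2.43) seconds
mask_interval = [round(t * codec_sr) for t in morphed_span]  # [56, 122] codec frames
# the model regenerates only the frames inside this interval; the rest of the codec stream is kept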

tts_demo.py

@@ -15,8 +15,6 @@ import numpy as np
 import torchaudio
 import torch
 import os
-os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 os.environ["USER"] = "me"  # TODO change this to your username
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -26,8 +24,8 @@ def parse_arguments():
     parser = argparse.ArgumentParser(
         description="VoiceCraft TTS Inference: see the script for more information on the options")
-    parser.add_argument("-m", "--model_name", type=str, default="giga330M.pth", choices=[
-        "giga330M.pth", "gigaHalfLibri330M_TTSEnhanced_max16s.pth", "giga830M.pth"],
+    parser.add_argument("-m", "--model_name", type=str, default="giga830M", choices=[
+        "giga330M", "giga830M", "giga330M_TTSEnhanced", "giga830M_TTSEnhanced"],
         help="VoiceCraft model to use")
     parser.add_argument("-st", "--silence_tokens", type=int, nargs="*",
         default=[1388, 1898, 131], help="Silence token IDs")
@@ -50,24 +48,25 @@ def parse_arguments():
         default=3, help="Batch size for sampling")
     parser.add_argument("-s", "--seed", type=int,
         default=1, help="Seed value.")
-    parser.add_argument("-bs", "--beam_size", type=int, default=10,
+    parser.add_argument("-bs", "--beam_size", type=int, default=50,
         help="beam size for MFA alignment")
-    parser.add_argument("-rbs", "--retry_beam_size", type=int, default=40,
+    parser.add_argument("-rbs", "--retry_beam_size", type=int, default=200,
         help="retry beam size for MFA alignment")
     parser.add_argument("--output_dir", type=str, default="./generated_tts",
         help="directory to save generated audio")
     parser.add_argument("-oa", "--original_audio", type=str,
-        default="./demo/84_121550_000074_000000.wav", help="location of audio file")
+        default="./demo/5895_34622_000026_000002.wav", help="location of audio file")
     parser.add_argument("-ot", "--original_transcript", type=str,
-        default="But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,",
+        default="Gwynplaine had, besides, for his work and for his feats of strength, round his neck and over his shoulders, an esclavine of leather.",
         help="original transcript")
     parser.add_argument("-tt", "--target_transcript", type=str,
-        default="object was seen as a mirage in the lake in the distance,",
+        default="I cannot believe that the same model can also do text to speech synthesis too!",
         help="target transcript")
     parser.add_argument("-co", "--cut_off_sec", type=float, default=3.6,
         help="cut off point in seconds for input prompt")
-    parser.add_argument("-ma", "--margin", type=float, default=0.07,
-        help="lowest margin in seconds between words for input prompt")
+    parser.add_argument("-ma", "--margin", type=float, default=0.04,
+        help="margin in seconds between the end of the cutoff word and the start of the next word; when the next word does not immediately follow the cutoff word, the algorithm is more tolerant of word alignment errors")
+    parser.add_argument("-cuttol", "--cutoff_tolerance", type=float, default=1, help="tolerance in seconds for the cutoff time; if no next-word boundary with the required margin is found within cut_off_sec plus the tolerance, the best cutoff time found is used, i.e. likely with no margin or a very small margin between the end of the cutoff word and the start of the next word")
     args = parser.parse_args()
     return args
@@ -96,6 +95,14 @@ sample_batch_size = args.sample_batch_size
 seed = args.seed  # change seed if you are still unhappy with the result
 # load the model
+if voicecraft_name == "330M":
+    voicecraft_name = "giga330M"
+elif voicecraft_name == "830M":
+    voicecraft_name = "giga830M"
+elif voicecraft_name == "330M_TTSEnhanced":
+    voicecraft_name = "330M_TTSEnhanced"
+elif voicecraft_name == "830M_TTSEnhanced":
+    voicecraft_name = "830M_TTSEnhanced"
 model = voicecraft.VoiceCraft.from_pretrained(
     f"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}")
 phn2num = model.args.phn2num
@@ -105,9 +112,7 @@ model.to(device)
 encodec_fn = "./pretrained_models/encodec_4cb2048_giga.th"
 if not os.path.exists(encodec_fn):
     os.system(
-        f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
-    os.system(
-        f"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th")
+        f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th -O ./pretrained_models/encodec_4cb2048_giga.th")
 # will also put the neural codec model on gpu
 audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device)
@@ -130,30 +135,34 @@ with open(f"{temp_folder}/{filename}.txt", "w") as f:
 align_temp = f"{temp_folder}/mfa_alignments"
 beam_size = args.beam_size
 retry_beam_size = args.retry_beam_size
-os.system(f"source ~/.bashrc && \
-    conda activate voicecraft && \
-    mfa align -v --clean -j 1 --output_format csv {temp_folder} \
-    english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}")
+alignments = f"{temp_folder}/mfa_alignments/{filename}.csv"
+if not os.path.isfile(alignments):
+    os.system(f"mfa align -v --clean -j 1 --output_format csv {temp_folder} \
+        english_us_arpa english_us_arpa {align_temp} --beam {beam_size} --retry_beam {retry_beam_size}")
 # if the above fails, it could be because the audio is too hard for the alignment model,
-# increasing the beam size usually solves the issue
+# increasing the beam_size and retry_beam_size usually solves the issue
-def find_closest_word_boundary(alignments, cut_off_sec, margin):
+def find_closest_word_boundary(alignments, cut_off_sec, margin, cutoff_tolerance=1):
     with open(alignments, 'r') as file:
         # skip header
         next(file)
-        prev_end = 0.0
         cutoff_time = None
         cutoff_index = None
-        for i, line in enumerate(file):
+        cutoff_time_best = None
+        cutoff_index_best = None
+        lines = [l for l in file.readlines()]
+        for i, line in enumerate(lines):
             end = float(line.strip().split(',')[1])
-            if end >= cut_off_sec and end - prev_end >= margin:
-                cutoff_time = end + margin / 2
+            if end >= cut_off_sec and cutoff_time == None:
+                cutoff_time = end
                 cutoff_index = i
-                break
-            prev_end = end
+            if end >= cut_off_sec and end < cut_off_sec + cutoff_tolerance and float(lines[i+1].strip().split(',')[0]) - end >= margin:
+                cutoff_time_best = end + margin * 2 / 3
+                cutoff_index_best = i
+                break
+    if cutoff_time_best != None:
+        cutoff_time = cutoff_time_best
+        cutoff_index = cutoff_index_best
     return cutoff_time, cutoff_index
 # take a look at demo/temp/mfa_alignments, decide which part of the audio to use as prompt
@@ -161,9 +170,9 @@ def find_closest_word_boundary(alignments, cut_off_sec, margin):
 cut_off_sec = args.cut_off_sec
 margin = args.margin
 audio_fn = f"{temp_folder}/{filename}.wav"
-alignments = f"{temp_folder}/mfa_alignments/{filename}.csv"
-cut_off_sec, cut_off_word_idx = find_closest_word_boundary(alignments, cut_off_sec, margin)
-target_transcript = " ".join(orig_transcript.split(" ")[:cut_off_word_idx]) + " " + args.target_transcript
+cut_off_sec, cut_off_word_idx = find_closest_word_boundary(alignments, cut_off_sec, margin, args.cutoff_tolerance)
+target_transcript = " ".join(orig_transcript.split(" ")[:cut_off_word_idx+1]) + " " + args.target_transcript
 # NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.
 info = torchaudio.info(audio_fn)
 audio_dur = info.num_frames / info.sample_rate
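Tracing the new cutoff logic against the alignment CSV committed above, with the new defaults (cut_off_sec=3.6, margin=0.04, cutoff_tolerance=1): the first word ending at or after 3.6 s is "strength" (end 3.61 s, row index 11), and the next word "round" starts at 3.95 s, so the 0.34 s gap clears the margin well inside the tolerance window. A sketch of the call, assuming the committed demo alignment path and running from the repo root:

cutoff, idx = find_closest_word_boundary(
    "./demo/temp/mfa_alignments/5895_34622_000026_000002.csv",
    cut_off_sec=3.6, margin=0.04, cutoff_tolerance=1)
# cutoff == 3.61 + 0.04 * 2 / 3 ≈ 3.64 s and idx == 11, so the new
# [:cut_off_word_idx+1] slice keeps the first 12 words of the prompt,
# "gwynplaine ... strength", before appending the target transcript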