VoiceCraft Inference Text To Speech Demo
===
This notebook installs system packages, a Conda environment, and many Python dependencies, so consider running it inside a Docker container to keep your host system clean.

Run the setup cells below one at a time (everything up to the STOP section), then change the Jupyter Notebook kernel to use the voicecraft environment.

In [None]:
# Install the OS-level dependencies with apt (requires sudo): git, ffmpeg and espeak-ng.
# `-y` answers the install prompts automatically so the cell does not block on input.
!sudo apt-get update && sudo apt-get install -y \
    git-core \
    ffmpeg \
    espeak-ng

In [None]:
# Update conda itself (in the base environment, from conda-forge), then create
# the `voicecraft` environment pinned to Python 3.9.16 and run `conda init bash`.
# Later cells source ~/.bashrc before calling `conda activate voicecraft`.
!conda update -y -n base -c conda-forge conda
!conda create -y -n voicecraft python=3.9.16 && \
    conda init bash

In [None]:
# Install the conda and pip packages inside the `voicecraft` conda environment created above.
# All versions are pinned; this step takes a while.
!echo -e "Grab a cup a coffee and a slice of pizza...\n\n"

# $HOME and $USER must be set up so that sourcing ~/.bashrc makes `conda activate` available.
# Each `!` command runs in its own shell, so the environment is re-activated every time.
!source ~/.bashrc && \
    conda activate voicecraft && \
    conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \
    pip install torch==2.0.1 && \
    pip install tensorboard==2.16.2 && \
    pip install phonemizer==3.2.1 && \
    pip install torchaudio==2.0.2 && \
    pip install datasets==2.16.0 && \
    pip install torchmetrics==0.11.1

# Install audiocraft (pinned to a specific commit) last; otherwise you'll get an error
# about the torch compiler missing due to an xformers version mismatch.
!source ~/.bashrc && \
    conda activate voicecraft && \
    pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft

In [None]:
# Install ipykernel into the `voicecraft` conda environment so that Jupyter Notebook
# can find it and offer it as a kernel (selected manually in the next step).
!source ~/.bashrc && \
    conda activate voicecraft && \
    conda install -y -n voicecraft ipykernel --update-deps --force-reinstall

# STOP
You have to do this part manually using the mouse/keyboard and the tabs at the top.

* Kernel -> Change Kernel -> Select Kernel -> voicecraft
* Kernel -> Restart Kernel -> Yes

Now you can run the rest of the notebook and get an audio sample output. It will download more models and such.

In [None]:
# Import the libraries used by the rest of the notebook.
# If this cell throws an error, something went wrong installing dependencies or changing the kernel above!
import os
# These are set before torch is imported: order GPUs by PCI bus ID and expose only device "0".
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
import torchaudio

# `data.tokenizer` is a project-local module, not one of the pip packages installed above
# (presumably the notebook must be run from the VoiceCraft repository root -- confirm).
from data.tokenizer import (
    AudioTokenizer,
    TextTokenizer,
)

In [None]:
# ---- Hyperparameters for inference ----

# Margins are only relevant to speech editing; the TTS flow does not use them.
left_margin, right_margin = 0.08, 0.08

# Codec settings. codec_audio_sr is also the playback rate used when the
# generated audio is displayed at the end of the notebook.
codec_audio_sr = 16000
codec_sr = 50  # presumably codec frames per second -- confirm against the codec config

# Sampling settings.
top_k, top_p = 0, 0.8
temperature = 1
kvcache = 1
silence_tokens = [1388, 1898, 131]

# ---- Adjust the three values below if the generation is not as good ----
# Random seed.
seed = 1
# If there are long silences in the generated audio, reduce stop_repetition (3, 2 or even 1).
stop_repetition = 3
# If there are long silences or unnaturally stretched words, increase sample_batch_size
# (2, 3 or even 4). The model runs sample_batch_size examples of the same audio and
# picks the shortest one.
sample_batch_size = 4

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

import shutil

# Point to the original file, or record your own.
# Write down the transcript for the file, or run whisper to get one (and correct
# it by hand if it is not accurate).
orig_audio = "./demo/84_121550_000074_000000.wav"
orig_transcript = "But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,"

# Stage the audio and its transcript side by side in a temp folder; that folder
# is the corpus directory handed to MFA for alignment.
temp_folder = "./demo/temp"
os.makedirs(temp_folder, exist_ok=True)
# shutil.copy raises if the source audio is missing (a shelled-out `cp` would
# fail silently) and is not affected by spaces or shell metacharacters in paths.
shutil.copy(orig_audio, temp_folder)
filename = os.path.splitext(os.path.basename(orig_audio))[0]
with open(f"{temp_folder}/{filename}.txt", "w", encoding="utf-8") as f:
    f.write(orig_transcript)

# Output folder for the MFA alignment files.
align_temp = f"{temp_folder}/mfa_alignments"
os.makedirs(align_temp, exist_ok=True)

# Activate the voicecraft conda environment, then download the `english_us_arpa`
# MFA dictionary and acoustic models; the alignment step below uses both.
!source ~/.bashrc && \
    conda activate voicecraft && \
    mfa model download dictionary english_us_arpa && \
    mfa model download acoustic english_us_arpa

import warnings

# Paths of the staged audio, its transcript, and the alignment CSV that MFA
# is expected to write. Defined first so they exist even if alignment fails.
audio_fn = f"{temp_folder}/{filename}.wav"
transcript_fn = f"{temp_folder}/{filename}.txt"
align_fn = f"{align_temp}/{filename}.csv"

# Run MFA inside the voicecraft conda environment to align the transcript to the audio.
mfa_align_cmd = f". ~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}"
mfa_status = os.system(mfa_align_cmd)

# os.system only reports failure through its return value, so check it (and the
# expected output file) instead of letting a failed alignment pass unnoticed.
if mfa_status != 0 or not os.path.exists(align_fn):
    warnings.warn(
        f"MFA alignment did not produce {align_fn} (exit status {mfa_status}); "
        "check the output above and consider retrying with a larger beam size."
    )

# If the alignment fails, it could be because the audio is too hard for the
# alignment model; increasing the beam size usually solves the issue:
# os.system(f". ~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000")

In [None]:
# Look at the alignment CSV under demo/temp/mfa_alignments and decide how much
# of the original audio to use as the voice prompt.
# NOTE: according to the forced-alignment file, the word "common" stops at
# 3.01 sec; this value will be different for different audio.
cut_off_sec = 3.01
# Full text to synthesize: the prompt's words up to the cut-off, followed by the new text.
target_transcript = "But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!"

# Duration of the prompt audio in seconds, from its metadata.
info = torchaudio.info(audio_fn)
audio_dur = info.num_frames / info.sample_rate

# The cut-off has to fall strictly inside the recording.
assert cut_off_sec < audio_dur, f"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}"

# Convert the cut-off from seconds to a frame index at the audio's sample rate.
prompt_end_frame = int(cut_off_sec * info.sample_rate)


# Load the model, tokenizers, and other necessary files.
from models import voicecraft

voicecraft_name = "giga830M.pth"
pretrained_dir = "./pretrained_models"
ckpt_fn = f"{pretrained_dir}/{voicecraft_name}"
encodec_fn = f"{pretrained_dir}/encodec_4cb2048_giga.th"
hf_base_url = "https://huggingface.co/pyp1/VoiceCraft/resolve/main"

# Create the target folder up front; previously the download was moved into it
# with `mv`, which fails when the folder does not exist yet.
os.makedirs(pretrained_dir, exist_ok=True)

# Fetch whichever checkpoint files are missing. download_url_to_file writes to a
# temporary file first, so an interrupted download does not leave a truncated
# file at the final path that would pass the exists() check on the next run.
for local_fn in (ckpt_fn, encodec_fn):
    if not os.path.exists(local_fn):
        torch.hub.download_url_to_file(f"{hf_base_url}/{os.path.basename(local_fn)}", local_fn)

# NOTE(security): torch.load unpickles the file; only load checkpoints from a source you trust.
ckpt = torch.load(ckpt_fn, map_location="cpu")
model = voicecraft.VoiceCraft(ckpt["config"])
model.load_state_dict(ckpt["model"])
model.to(device)
model.eval()

# Phoneme-to-index mapping stored alongside the weights in the checkpoint.
phn2num = ckpt['phn2num']

text_tokenizer = TextTokenizer(backend="espeak")
audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu

import random

from IPython.display import Audio, display
from inference_tts_scale import inference_one_sample

# Apply the seed chosen in the hyperparameter cell. It was previously defined
# but never used, so changing it had no effect on the sampled output.
random.seed(seed)
torch.manual_seed(seed)

# Run the model to get the output.
decode_config = {
    'top_k': top_k,
    'top_p': top_p,
    'temperature': temperature,
    'stop_repetition': stop_repetition,
    'kvcache': kvcache,
    "codec_audio_sr": codec_audio_sr,
    "codec_sr": codec_sr,
    "silence_tokens": silence_tokens,
    "sample_batch_size": sample_batch_size,
}
concated_audio, gen_audio = inference_one_sample(
    model,
    ckpt["config"],
    phn2num,
    text_tokenizer,
    audio_tokenizer,
    audio_fn,
    target_transcript,
    device,
    decode_config,
    prompt_end_frame,
)

# Keep the first item of each result and move it to the CPU for playback/saving.
concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()

# Display the audio. `display` is imported explicitly so this also works where
# it is not injected as a notebook global.
print("concatenate prompt and generated:")
display(Audio(concated_audio, rate=codec_audio_sr))

print("generated:")
display(Audio(gen_audio, rate=codec_audio_sr))

# # save the audio
# # output_dir
# output_dir = "/home/pyp/VoiceCraft/demo/generated_tts"
# os.makedirs(output_dir, exist_ok=True)
# seg_save_fn_gen = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav"
# seg_save_fn_concat = f"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav"        

# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)
# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)

# if you get an error importing T5 in transformers,
# try reinstalling Pillow:
# pip uninstall Pillow
# pip install Pillow
# you might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1); these can be safely ignored