VoiceCraft/inference_tts.ipynb

273 lines
12 KiB
Plaintext
Raw Normal View History

2024-03-21 19:02:20 +01:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"VoiceCraft Inference Text To Speech Demo\n",
"===\n",
"This will install a ton of dependencies all over so consider using the provided docker container start-jupyter script to keep the cruft off your dev box.\n",
"\n",
"Run the next cells one at a time up until the *STOP* and follow those instructions before continuing. You only have to do this the first time to setup the container."
]
},
2024-03-21 19:02:20 +01:00
{
"cell_type": "code",
"execution_count": null,
2024-03-21 19:02:20 +01:00
"metadata": {},
"outputs": [],
"source": [
"# install OS deps\n",
"!sudo apt-get update && sudo apt-get install -y \\\n",
" git-core \\\n",
" ffmpeg \\\n",
" espeak-ng"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Update and setup Conda voicecraft environment\n",
"!conda update -y -n base -c conda-forge conda\n",
"!conda create -y -n voicecraft python=3.9.16 && \\\n",
" conda init bash"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# install conda and pip stuff in the activated conda above context\n",
"!echo -e \"Grab a cup a coffee and a slice of pizza...\\n\\n\"\n",
"\n",
"# make sure $HOME and $USER are setup so this will source the conda environment\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \\\n",
" pip install torch==2.0.1 && \\\n",
" pip install tensorboard==2.16.2 && \\\n",
" pip install phonemizer==3.2.1 && \\\n",
" pip install torchaudio==2.0.2 && \\\n",
" pip install datasets==2.16.0 && \\\n",
" pip install torchmetrics==0.11.1\n",
"\n",
"# do this one last otherwise you'll get an error about torch compiler missing due to xformer mismatch\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# okay setup the conda environment such that jupyter notebook can find the kernel\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" conda install -y -n voicecraft ipykernel --update-deps --force-reinstall\n",
"\n",
"# installs the Jupyter kernel into /home/myusername/.local/share/jupyter/kernels/voicecraft\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" python3 -m ipykernel install --user --name=voicecraft"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# STOP\n",
"You have to do this part manually using the mouse/keyboard and the tabs at the top.\n",
"\n",
"* Refresh your browser to make sure it picks up the new kernel.\n",
"* Kernel -> Change Kernel -> Select Kernel -> voicecraft\n",
"* Kernel -> Restart Kernel -> Yes\n",
"\n",
"Now you can run the rest of the notebook and get an audio sample output. It will automatically download more models and such. The next time you use this container, you can just start below here as the dependencies will remain available until you delete the docker container."
2024-03-21 19:02:20 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-21 19:02:20 +01:00
"metadata": {},
2024-03-29 00:21:30 +01:00
"outputs": [],
2024-03-21 19:02:20 +01:00
"source": [
"# import libs\n",
"# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n",
"import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
"\n",
2024-03-21 19:02:20 +01:00
"import torch\n",
"import torchaudio\n",
"\n",
"from data.tokenizer import (\n",
" AudioTokenizer,\n",
" TextTokenizer,\n",
")"
2024-03-21 19:02:20 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-21 19:02:20 +01:00
"metadata": {},
"outputs": [],
2024-03-21 19:02:20 +01:00
"source": [
"# hyperparameters for inference\n",
2024-03-29 00:21:30 +01:00
"left_margin = 0.08 # not used for TTS, only for speech editing\n",
"right_margin = 0.08 # not used for TTS, only for speech editing\n",
2024-03-21 19:02:20 +01:00
"codec_audio_sr = 16000\n",
"codec_sr = 50\n",
"top_k = 0\n",
"top_p = 0.8\n",
"temperature = 1\n",
2024-03-29 00:21:30 +01:00
"kvcache = 1\n",
2024-03-21 19:02:20 +01:00
"silence_tokens=[1388,1898,131]\n",
2024-03-29 00:21:30 +01:00
"# adjust the below three arguments if the generation is not as good\n",
"seed = 1 # random seed magic\n",
"stop_repetition = 3 # if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n",
"sample_batch_size = 4 # if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4\n",
2024-03-21 19:02:20 +01:00
"# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# point to the original file or record the file\n",
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
2024-03-29 00:21:30 +01:00
"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
2024-03-21 19:02:20 +01:00
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
"\n",
"# move the audio and transcript to temp folder\n",
2024-03-29 00:21:30 +01:00
"temp_folder = \"./demo/temp\"\n",
2024-03-21 19:02:20 +01:00
"os.makedirs(temp_folder, exist_ok=True)\n",
"os.system(f\"cp {orig_audio} {temp_folder}\")\n",
"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
" f.write(orig_transcript)\n",
"# run MFA to get the alignment\n",
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
"os.makedirs(align_temp, exist_ok=True)\n",
"\n",
"# get into the conda environment and download the needed MFA models\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" mfa model download dictionary english_us_arpa && \\\n",
" mfa model download acoustic english_us_arpa\n",
"\n",
"#os.system(f\". ~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
" english_us_arpa english_us_arpa {align_temp}\n",
"\n",
2024-03-21 19:02:20 +01:00
"# if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
"# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
"transcript_fn = f\"{temp_folder}/{filename}.txt\"\n",
"align_fn = f\"{align_temp}/{filename}.csv\""
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-21 19:02:20 +01:00
"metadata": {},
"outputs": [],
2024-03-21 19:02:20 +01:00
"source": [
"# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n",
2024-03-29 00:21:30 +01:00
"cut_off_sec = 3.01 # NOTE: according to forced-alignment file, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",
2024-03-21 19:02:20 +01:00
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
"info = torchaudio.info(audio_fn)\n",
"audio_dur = info.num_frames / info.sample_rate\n",
"\n",
"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
"\n",
"\n",
"# # load model, tokenizer, and other necessary files\n",
"from models import voicecraft\n",
"#import models.voicecraft as voicecraft\n",
2024-03-29 00:21:30 +01:00
"voicecraft_name=\"giga830M.pth\"\n",
"ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
"if not os.path.exists(ckpt_fn):\n",
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",
" os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",
"if not os.path.exists(encodec_fn):\n",
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
" os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
"\n",
2024-03-21 19:02:20 +01:00
"ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
"model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
"model.load_state_dict(ckpt[\"model\"])\n",
"model.to(device)\n",
"model.eval()\n",
"\n",
"phn2num = ckpt['phn2num']\n",
"\n",
"text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
"\n",
"# run the model to get the output\n",
"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",
"from inference_tts_scale import inference_one_sample\n",
"concated_audio, gen_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",
" \n",
"# save segments for comparison\n",
"concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",
"# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",
"\n",
"\n",
2024-03-29 00:21:30 +01:00
"# display the audio\n",
"from IPython.display import Audio\n",
"print(\"concatenate prompt and generated:\")\n",
"display(Audio(concated_audio, rate=codec_audio_sr))\n",
"\n",
"print(\"generated:\")\n",
"display(Audio(gen_audio, rate=codec_audio_sr))\n",
"\n",
"# # save the audio\n",
"# # output_dir\n",
"# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",
"# os.makedirs(output_dir, exist_ok=True)\n",
"# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",
"# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n",
2024-03-21 19:02:20 +01:00
"\n",
2024-03-29 00:21:30 +01:00
"# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",
"# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",
2024-03-21 19:02:20 +01:00
"\n",
"# if you get error importing T5 in transformers\n",
"# try \n",
"# pip uninstall Pillow\n",
"# pip install Pillow\n",
"# you are might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "voicecraft",
"language": "python",
"name": "voicecraft"
2024-03-21 19:02:20 +01:00
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
2024-03-21 19:02:20 +01:00
}
},
"nbformat": 4,
"nbformat_minor": 4
2024-03-21 19:02:20 +01:00
}