{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "VoiceCraft Inference Text To Speech Demo\n",
    "===\n",
    "This will install a bunch of garbage all over so consider using a docker container to contain the cruft.\n",
    "\n",
    "Run the next 4 code cells one at a time, then change the Jupyter Notebook kernel to use the voicecraft environment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# install OS deps\n",
    "!sudo apt-get update && sudo apt-get install -y \\\n",
    "    git-core \\\n",
    "    ffmpeg \\\n",
    "    espeak-ng"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Update and setup Conda voicecraft environment\n",
    "!conda update -y -n base -c conda-forge conda\n",
    "!conda create -y -n voicecraft python=3.9.16 && \\\n",
    "    conda init bash"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# install conda and pip stuff in the activated conda above context\n",
    "!echo -e \"Grab a cup a coffee and a slice of pizza...\\n\\n\"\n",
    "\n",
    "# make sure $HOME and $USER are setup so this will source the conda environment\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \\\n",
    "    pip install torch==2.0.1 && \\\n",
    "    pip install tensorboard==2.16.2 && \\\n",
    "    pip install phonemizer==3.2.1 && \\\n",
    "    pip install torchaudio==2.0.2 && \\\n",
    "    pip install datasets==2.16.0 && \\\n",
    "    pip install torchmetrics==0.11.1\n",
    "\n",
    "# do this one last otherwise you'll get an error about torch compiler missing due to xformer mismatch\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# okay setup the conda environment such that jupyter notebook can find the kernel\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    conda install -y -n voicecraft ipykernel --update-deps --force-reinstall"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# STOP\n",
    "You have to do this part manually using the mouse/keyboard and the tabs at the top.\n",
    "\n",
    "* Kernel -> Change Kernel -> Select Kernel -> voicecraft\n",
    "* Kernel -> Restart Kernel -> Yes\n",
    "\n",
    "Now you can run the rest of the notebook and get an audio sample output. It will download more models and such."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import libs (all imports for the rest of the notebook live in this one cell)\n",
    "# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n",
    "import os\n",
    "# these must be set before torch is imported so that they take effect\n",
    "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
    "\n",
    "import random\n",
    "import shutil\n",
    "\n",
    "import numpy as np\n",
    "import torch\n",
    "import torchaudio\n",
    "from IPython.display import Audio, display\n",
    "\n",
    "# project-local modules: the notebook must be run from the repository root\n",
    "from data.tokenizer import (\n",
    "    AudioTokenizer,\n",
    "    TextTokenizer,\n",
    ")\n",
    "from inference_tts_scale import inference_one_sample\n",
    "from models import voicecraft"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hyperparameters for inference\n",
    "left_margin = 0.08 # not used for TTS, only for speech editing\n",
    "right_margin = 0.08 # not used for TTS, only for speech editing\n",
    "codec_audio_sr = 16000\n",
    "codec_sr = 50\n",
    "top_k = 0\n",
    "top_p = 0.8\n",
    "temperature = 1\n",
    "kvcache = 1\n",
    "silence_tokens=[1388,1898,131]\n",
    "# adjust the below three arguments if the generation is not as good\n",
    "seed = 1 # random seed, applied to the RNGs right before inference in the last cell\n",
    "stop_repetition = 3 # if there are long silences in the generated audio, reduce stop_repetition to 2 or even 1\n",
    "sample_batch_size = 4 # if there are long silences or unnaturally stretched words, try a larger sample_batch_size (e.g. 2, 3 or even 4)\n",
    "# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "# point to the original file or record the file\n",
    "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
    "orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
    "orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
    "\n",
    "# copy the audio and write the transcript into the temp folder\n",
    "temp_folder = \"./demo/temp\"\n",
    "os.makedirs(temp_folder, exist_ok=True)\n",
    "# shutil.copy raises if the source file is missing instead of failing silently like a shelled-out cp\n",
    "shutil.copy(orig_audio, temp_folder)\n",
    "filename = os.path.splitext(os.path.basename(orig_audio))[0]\n",
    "with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
    "    f.write(orig_transcript)\n",
    "# run MFA to get the alignment\n",
    "align_temp = f\"{temp_folder}/mfa_alignments\"\n",
    "os.makedirs(align_temp, exist_ok=True)\n",
    "\n",
    "# get into the conda environment and download the needed MFA models\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    mfa model download dictionary english_us_arpa && \\\n",
    "    mfa model download acoustic english_us_arpa\n",
    "\n",
    "mfa_status = os.system(f\". ~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
    "if mfa_status != 0:\n",
    "    # only warn: the alignment file is inspected by hand to choose the prompt cut-off, nothing below reads it\n",
    "    print(f\"WARNING: mfa align exited with status {mfa_status}; see the beam size hint in the comment below\")\n",
    "\n",
    "# if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
    "# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",
    "audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
    "transcript_fn = f\"{temp_folder}/{filename}.txt\"\n",
    "align_fn = f\"{align_temp}/{filename}.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# take a look at demo/temp/mfa_alignments, decide which part of the audio to use as prompt\n",
    "cut_off_sec = 3.01 # NOTE: according to forced-alignment file, the word \"common\" stops at 3.01 sec, this should be different for different audio\n",
    "target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
    "info = torchaudio.info(audio_fn)\n",
    "audio_dur = info.num_frames / info.sample_rate\n",
    "\n",
    "assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
    "prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
    "\n",
    "# download the pretrained checkpoints if they are not on disk yet\n",
    "voicecraft_name = \"giga830M.pth\"\n",
    "encodec_name = \"encodec_4cb2048_giga.th\"\n",
    "pretrained_dir = \"./pretrained_models\"\n",
    "ckpt_fn = f\"{pretrained_dir}/{voicecraft_name}\"\n",
    "encodec_fn = f\"{pretrained_dir}/{encodec_name}\"\n",
    "# make sure the target directory exists before anything is written into it\n",
    "os.makedirs(pretrained_dir, exist_ok=True)\n",
    "for model_name, model_fn in ((voicecraft_name, ckpt_fn), (encodec_name, encodec_fn)):\n",
    "    if not os.path.exists(model_fn):\n",
    "        # raises on a failed download instead of continuing with a missing checkpoint\n",
    "        torch.hub.download_url_to_file(f\"https://huggingface.co/pyp1/VoiceCraft/resolve/main/{model_name}\", model_fn)\n",
    "\n",
    "# load model, tokenizer, and other necessary files\n",
    "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
    "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
    "model.load_state_dict(ckpt[\"model\"])\n",
    "model.to(device)\n",
    "model.eval()\n",
    "\n",
    "phn2num = ckpt['phn2num']\n",
    "\n",
    "text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
    "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
    "\n",
    "\n",
    "def seed_everything(seed):\n",
    "    \"\"\"Seed the Python, NumPy and PyTorch RNGs so that sampling is repeatable for a given seed.\"\"\"\n",
    "    os.environ[\"PYTHONHASHSEED\"] = str(seed)\n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    torch.manual_seed(seed)\n",
    "    torch.cuda.manual_seed_all(seed)\n",
    "    torch.backends.cudnn.benchmark = False\n",
    "    torch.backends.cudnn.deterministic = True\n",
    "\n",
    "\n",
    "# apply the seed chosen in the hyperparameter cell; re-run this cell after changing it\n",
    "seed_everything(seed)\n",
    "\n",
    "# run the model to get the output\n",
    "decode_config = {\n",
    "    \"top_k\": top_k,\n",
    "    \"top_p\": top_p,\n",
    "    \"temperature\": temperature,\n",
    "    \"stop_repetition\": stop_repetition,\n",
    "    \"kvcache\": kvcache,\n",
    "    \"codec_audio_sr\": codec_audio_sr,\n",
    "    \"codec_sr\": codec_sr,\n",
    "    \"silence_tokens\": silence_tokens,\n",
    "    \"sample_batch_size\": sample_batch_size,\n",
    "}\n",
    "concated_audio, gen_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",
    "\n",
    "# keep the first item of each result on the CPU for playback and saving\n",
    "concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",
    "\n",
    "# display the audio\n",
    "print(\"concatenate prompt and generated:\")\n",
    "display(Audio(concated_audio, rate=codec_audio_sr))\n",
    "\n",
    "print(\"generated:\")\n",
    "display(Audio(gen_audio, rate=codec_audio_sr))\n",
    "\n",
    "# # save the audio\n",
    "# # output_dir\n",
    "# output_dir = \"./demo/generated_tts\"\n",
    "# os.makedirs(output_dir, exist_ok=True)\n",
    "# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",
    "# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\"\n",
    "\n",
    "# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",
    "# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",
    "\n",
    "# if you get error importing T5 in transformers\n",
    "# try\n",
    "# pip uninstall Pillow\n",
    "# pip install Pillow\n",
    "# you might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "voicecraft",
   "language": "python",
   "name": "voicecraft"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}