diff --git a/README.md b/README.md index d05685d..f2a1df3 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,35 @@ To clone or edit an unseen voice, VoiceCraft needs only a few seconds of referen ## News :star: 03/28/2024: Model weights are up on HuggingFace🤗 [here](https://huggingface.co/pyp1/VoiceCraft/tree/main)! +## QuickStart +For Linux only; it likely also works on Ubuntu under Windows Subsystem for Linux (WSL). +```bash +# 1. clone the repo into a directory on a drive with plenty of free space +git clone git@github.com:jasonppy/VoiceCraft.git +cd VoiceCraft + +# 2. assumes you have docker installed with the NVIDIA Container Toolkit +# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.13.5/install-guide.html +# sudo apt-get install -y nvidia-container-toolkit-base || yay -Syu nvidia-container-toolkit || echo etc... + +# 3. Try to start an existing container, otherwise create a new one passing in all GPUs +./start-jupyter.sh + +# 4. now open a browser on the host at the URL shown at the bottom of: +docker logs jupyter + +# 5. optionally look inside from another terminal +docker exec -it jupyter /bin/bash +export USER=(your_linux_username_used_above) +export HOME=/home/$USER +sudo apt-get update + +# 6. confirm video card(s) are visible inside the container +nvidia-smi + +# 7. Now in the browser, open inference_tts.ipynb and work through it one cell at a time +echo GOOD LUCK AND BE NICE +``` ## TODO The TODOs left will be completed by the end of March 2024. 
diff --git a/inference_tts.ipynb b/inference_tts.ipynb index 89be66c..8ca6c01 100644 --- a/inference_tts.ipynb +++ b/inference_tts.ipynb @@ -1,230 +1,118 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import os\n", - "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n", - "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\"" + "VoiceCraft Inference Text To Speech Demo\n", + "===\n", + "This notebook installs many OS, conda, and pip packages, so consider running it inside a Docker container to keep them off your host system.\n", + "\n", + "Run the next 5 cells one at a time, then change the Jupyter Notebook kernel to use the voicecraft environment." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install OS deps\n", + "!sudo apt-get update && sudo apt-get install -y \\\n", + " git-core \\\n", + " ffmpeg \\\n", + " espeak-ng" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Update conda and set up the voicecraft environment\n", + "!conda update -y -n base -c conda-forge conda\n", + "!conda create -y -n voicecraft python=3.9.16 && \\\n", + " conda init bash" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install conda and pip packages into the voicecraft environment created above\n", + "!echo -e \"Grab a cup a coffee and a slice of pizza...\\n\\n\"\n", + "\n", + "# make sure $HOME and $USER are set up so that sourcing ~/.bashrc picks up the conda environment\n", + "!source ~/.bashrc && \\\n", + " conda activate voicecraft && \\\n", + " conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \\\n", + " pip install torch==2.0.1 && \\\n", + " pip install tensorboard==2.16.2 && \\\n", + " pip install phonemizer==3.2.1 && \\\n", + " pip install torchaudio==2.0.2 && \\\n", + " pip install 
datasets==2.16.0 && \\\n", + " pip install torchmetrics==0.11.1\n", + "\n", + "# do this one last; otherwise you'll get an error about the torch compiler missing due to an xformers version mismatch\n", + "!source ~/.bashrc && \\\n", + " conda activate voicecraft && \\\n", + " pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# install ipykernel into the voicecraft environment so that Jupyter can find the kernel\n", + "!source ~/.bashrc && \\\n", + " conda activate voicecraft && \\\n", + " conda install -y -n voicecraft ipykernel --update-deps --force-reinstall" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# STOP\n", + "You have to do this part manually, using the mouse/keyboard and the menu bar at the top.\n", + "\n", + "* Kernel -> Change Kernel -> Select Kernel -> voicecraft\n", + "* Kernel -> Restart Kernel -> Yes\n", + "\n", + "Now you can run the rest of the notebook and get an audio sample output. The remaining cells will download additional models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import libs\n", + "# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n", + "import os\n", + "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", + "\n", "import torch\n", "import torchaudio\n", "\n", "from data.tokenizer import (\n", " AudioTokenizer,\n", " TextTokenizer,\n", - ")\n", - "\n", - "\n" + ")" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Setting up corpus information\u001b[33m...\u001b[0m \n", - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Loading corpus from source files\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/100 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", - "\u001b[?25h" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Found \u001b[1;36m1\u001b[0m speaker across \u001b[1;36m1\u001b[0m file, average number of utterances per \n", - "\u001b[2;36m \u001b[0m speaker: \u001b[1;36m1.0\u001b[0m \n", - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Initializing multiprocessing jobs\u001b[33m...\u001b[0m \n", - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Normalizing text\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? 
it/s\u001b[0m ]\n", - "\u001b[?25h" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split for feature generation\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/2 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating MFCCs\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating CMVN\u001b[33m...\u001b[0m \n", - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating final features\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? 
it/s\u001b[0m ]\n", - "\u001b[?25h" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split with features\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", - "\u001b[?25h" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Compiling training graphs\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing first-pass alignment\u001b[33m...\u001b[0m \n", - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? 
it/s\u001b[0m ]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating fMLLR for speaker adaptation\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing second-pass alignment\u001b[33m...\u001b[0m \n", - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Collecting phone and word alignments from alignment lattices\u001b[33m...\u001b[0m \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? 
it/s\u001b[0m ]\n", - "\u001b[?25h" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[2;36m \u001b[0m\u001b[33mWARNING \u001b[0m Alignment analysis not available without using postgresql \n", - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Exporting alignment TextGrids to demo/temp/mfa_alignments\u001b[33m...\u001b[0m \n", - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Finished exporting TextGrids to demo/temp/mfa_alignments! \n", - "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Done! Everything took \u001b[1;36m40.634\u001b[0m seconds \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n", - "\u001b[?25h" - ] - } - ], + "outputs": [], "source": [ "# hyperparameters for inference\n", "left_margin = 0.08 # not used for TTS, only for speech editing\n", @@ -258,7 +146,15 @@ "# run MFA to get the alignment\n", "align_temp = f\"{temp_folder}/mfa_alignments\"\n", "os.makedirs(align_temp, exist_ok=True)\n", - "os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n", + "\n", + "# get into the conda environment and download the needed MFA models\n", + "!source ~/.bashrc && \\\n", + " conda activate voicecraft && \\\n", + " mfa model download dictionary english_us_arpa && \\\n", + " mfa model download acoustic english_us_arpa\n", + "\n", + "os.system(f\". 
~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n", + "\n", "# if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n", "# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n", "audio_fn = f\"{temp_folder}/{filename}.wav\"\n", @@ -268,58 +164,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "concatenate prompt and generated:\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "generated:\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n", "cut_off_sec = 3.01 # NOTE: according to forced-alignment file, the word \"common\" stop as 3.01 sec, this should be different for different audio\n", @@ -333,6 +180,7 @@ "\n", "# # load model, tokenizer, and other necessary files\n", "from models import voicecraft\n", + "#import models.voicecraft as voicecraft\n", "voicecraft_name=\"giga830M.pth\"\n", "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n", "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", @@ -388,20 +236,13 @@ "# pip install Pillow\n", "# you are might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": 
[], - "source": [] } ], "metadata": { "kernelspec": { "display_name": "voicecraft", "language": "python", - "name": "python3" + "name": "voicecraft" }, "language_info": { "codemirror_mode": { @@ -413,9 +254,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.9.19" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }
diff --git a/start-jupyter.sh b/start-jupyter.sh
new file mode 100755
index 0000000..5888bfb
--- /dev/null
+++ b/start-jupyter.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Start the "jupyter" container, creating it on first run.
+#
+## Assumes you have docker installed with the NVIDIA Container Toolkit
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.13.5/install-guide.html
+# sudo apt-get install -y nvidia-container-toolkit-base || yay -Syu nvidia-container-toolkit || echo etc...
+
+# User name passed to the container as NB_USER and used to build the home/work paths.
+# Every argument below is expanded by this (host) shell before docker runs, so the
+# name has to be defined here; fall back to `id -un` when $USER is not set.
+NB_USER="${USER:-$(id -un)}"
+
+## Try to start an existing container, otherwise create a new one with all GPUs passed in.
+# NB_USER, CHOWN_HOME and GRANT_SUDO are handed to the image as environment variables;
+# their effect is defined by the image (see the Jupyter Docker Stacks documentation).
+docker start jupyter 2> /dev/null || \
+docker run -it \
+    -d \
+    --gpus all \
+    -p 8888:8888 \
+    --name jupyter \
+    --user root \
+    -e NB_USER="$NB_USER" \
+    -e CHOWN_HOME=yes \
+    -e GRANT_SUDO=yes \
+    -w "/home/${NB_USER}" \
+    -v "$PWD":"/home/${NB_USER}/work" \
+    jupyter/base-notebook
+
+## `docker logs jupyter` to get the URL link and token e.g.
+## http://127.0.0.1:8888/lab?token=blahblahblahblabhlaabhalbhalbhal