diff --git a/README.md b/README.md
index d05685d..f2a1df3 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,35 @@ To clone or edit an unseen voice, VoiceCraft needs only a few seconds of referen
## News
:star: 03/28/2024: Model weights are up on HuggingFace 🤗 [here](https://huggingface.co/pyp1/VoiceCraft/tree/main)!
+## QuickStart
+For Linux only, and likely also Windows Subsystem for Linux (WSL) Ubuntu.
+```bash
+# 1. clone the repo into a directory on a drive with plenty of free space
+git clone git@github.com:jasonppy/VoiceCraft.git
+cd VoiceCraft
+
+# 2. assumes you have Docker installed with the NVIDIA Container Toolkit
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.13.5/install-guide.html
+# sudo apt-get install -y nvidia-container-toolkit-base || yay -Syu nvidia-container-toolkit || echo etc...
+
+# 3. Try to start an existing container, otherwise create a new one passing in all GPUs
+./start-jupyter.sh
+
+# 4. now open a browser on the host machine to the URL shown at the bottom of:
+docker logs jupyter
+
+# 5. optionally, open a shell inside the container from another terminal
+docker exec -it jupyter /bin/bash
+export USER=(your_linux_username_used_above)
+export HOME=/home/$USER
+sudo apt-get update
+
+# 6. confirm video card(s) are visible inside container
+nvidia-smi
+
+# 7. Now, in the browser, open inference_tts.ipynb and work through it one cell at a time
+echo GOOD LUCK AND BE NICE
+```
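+
+The notebook downloads the models it needs on first run. If you would rather fetch them ahead of time, a sketch like the one below should work: the file names come from `inference_tts.ipynb`, and the URLs assume the weights are published in the HuggingFace repo linked above.
+```bash
+# optional: pre-download the weights the notebook expects under ./pretrained_models
+# (file names taken from inference_tts.ipynb; adjust if the repo layout differs)
+mkdir -p pretrained_models
+wget -P pretrained_models https://huggingface.co/pyp1/VoiceCraft/resolve/main/giga830M.pth
+wget -P pretrained_models https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th
+```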
## TODO
The TODOs left will be completed by the end of March 2024.
diff --git a/inference_tts.ipynb b/inference_tts.ipynb
index 89be66c..8ca6c01 100644
--- a/inference_tts.ipynb
+++ b/inference_tts.ipynb
@@ -1,230 +1,118 @@
{
"cells": [
{
- "cell_type": "code",
- "execution_count": 1,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "import os\n",
- "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
- "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\""
+ "VoiceCraft Inference Text To Speech Demo\n",
+ "===\n",
+ "This will install a bunch of garbage all over so consider using a docker container to contain the cruft.\n",
+ "\n",
+ "Run the next 5 cells one at a time then change the Jupyter Notebook Kernel to use the voicecraft environment."
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# install OS deps\n",
+ "!sudo apt-get update && sudo apt-get install -y \\\n",
+ " git-core \\\n",
+ " ffmpeg \\\n",
+ " espeak-ng"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Update and setup Conda voicecraft environment\n",
+ "!conda update -y -n base -c conda-forge conda\n",
+ "!conda create -y -n voicecraft python=3.9.16 && \\\n",
+ " conda init bash"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# install conda and pip stuff in the activated conda above context\n",
+ "!echo -e \"Grab a cup a coffee and a slice of pizza...\\n\\n\"\n",
+ "\n",
+ "# make sure $HOME and $USER are setup so this will source the conda environment\n",
+ "!source ~/.bashrc && \\\n",
+ " conda activate voicecraft && \\\n",
+ " conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \\\n",
+ " pip install torch==2.0.1 && \\\n",
+ " pip install tensorboard==2.16.2 && \\\n",
+ " pip install phonemizer==3.2.1 && \\\n",
+ " pip install torchaudio==2.0.2 && \\\n",
+ " pip install datasets==2.16.0 && \\\n",
+ " pip install torchmetrics==0.11.1\n",
+ "\n",
+ "# do this one last otherwise you'll get an error about torch compiler missing due to xformer mismatch\n",
+ "!source ~/.bashrc && \\\n",
+ " conda activate voicecraft && \\\n",
+ " pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# okay setup the conda environment such that jupyter notebook can find the kernel\n",
+ "!source ~/.bashrc && \\\n",
+ " conda activate voicecraft && \\\n",
+ " conda install -y -n voicecraft ipykernel --update-deps --force-reinstall"
+ ]
+ },
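+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check (not part of the original setup): confirm the voicecraft\n",
+    "# environment was created and that PyTorch inside it can see the GPU, before\n",
+    "# switching kernels below.\n",
+    "!source ~/.bashrc && \\\n",
+    " conda activate voicecraft && \\\n",
+    " python -c \"import torch; print(torch.__version__, torch.cuda.is_available())\""
+   ]
+  },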
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# STOP\n",
+ "You have to do this part manually using the mouse/keyboard and the tabs at the top.\n",
+ "\n",
+ "* Kernel -> Change Kernel -> Select Kernel -> voicecraft\n",
+ "* Kernel -> Restart Kernel -> Yes\n",
+ "\n",
+ "Now you can run the rest of the notebook and get an audio sample output. It will download more models and such."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import libs\n",
+ "# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n",
+ "import os\n",
+ "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
+ "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
+ "\n",
"import torch\n",
"import torchaudio\n",
"\n",
"from data.tokenizer import (\n",
" AudioTokenizer,\n",
" TextTokenizer,\n",
- ")\n",
- "\n",
- "\n"
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Setting up corpus information\u001b[33m...\u001b[0m \n",
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Loading corpus from source files\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/100 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[?25h"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Found \u001b[1;36m1\u001b[0m speaker across \u001b[1;36m1\u001b[0m file, average number of utterances per \n",
- "\u001b[2;36m \u001b[0m speaker: \u001b[1;36m1.0\u001b[0m \n",
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Initializing multiprocessing jobs\u001b[33m...\u001b[0m \n",
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Normalizing text\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[?25h"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split for feature generation\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/2 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating MFCCs\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating CMVN\u001b[33m...\u001b[0m \n",
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating final features\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[?25h"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Creating corpus split with features\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[?25h"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Compiling training graphs\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing first-pass alignment\u001b[33m...\u001b[0m \n",
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Calculating fMLLR for speaker adaptation\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Performing second-pass alignment\u001b[33m...\u001b[0m \n",
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Generating alignments\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Collecting phone and word alignments from alignment lattices\u001b[33m...\u001b[0m \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 100%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m1/1 \u001b[0m [ \u001b[33m0:00:01\u001b[0m < \u001b[36m0:00:00\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[?25h"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[2;36m \u001b[0m\u001b[33mWARNING \u001b[0m Alignment analysis not available without using postgresql \n",
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Exporting alignment TextGrids to demo/temp/mfa_alignments\u001b[33m...\u001b[0m \n",
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Finished exporting TextGrids to demo/temp/mfa_alignments! \n",
- "\u001b[2;36m \u001b[0m\u001b[32mINFO \u001b[0m Done! Everything took \u001b[1;36m40.634\u001b[0m seconds \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[2K\u001b[35m 0%\u001b[0m \u001b[90mββββββββββββββββββββββββββββββββββββββ\u001b[0m \u001b[32m0/1 \u001b[0m [ \u001b[33m0:00:00\u001b[0m < \u001b[36m-:--:--\u001b[0m , \u001b[31m? it/s\u001b[0m ]\n",
- "\u001b[?25h"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# hyperparameters for inference\n",
"left_margin = 0.08 # not used for TTS, only for speech editing\n",
@@ -258,7 +146,15 @@
"# run MFA to get the alignment\n",
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
"os.makedirs(align_temp, exist_ok=True)\n",
- "os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
+ "\n",
+ "# get into the conda environment and download the needed MFA models\n",
+ "!source ~/.bashrc && \\\n",
+ " conda activate voicecraft && \\\n",
+ " mfa model download dictionary english_us_arpa && \\\n",
+ " mfa model download acoustic english_us_arpa\n",
+ "\n",
+ "os.system(f\". ~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
+ "\n",
"# if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
"# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
@@ -268,58 +164,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "concatenate prompt and generated:\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "generated:\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- " \n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n",
"cut_off_sec = 3.01 # NOTE: according to forced-alignment file, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",
@@ -333,6 +180,7 @@
"\n",
"# # load model, tokenizer, and other necessary files\n",
"from models import voicecraft\n",
+ "#import models.voicecraft as voicecraft\n",
"voicecraft_name=\"giga830M.pth\"\n",
"ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
@@ -388,20 +236,13 @@
"# pip install Pillow\n",
"# you are might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "voicecraft",
"language": "python",
- "name": "python3"
+ "name": "voicecraft"
},
"language_info": {
"codemirror_mode": {
@@ -413,9 +254,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.18"
+ "version": "3.9.19"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/start-jupyter.sh b/start-jupyter.sh
new file mode 100755
index 0000000..5888bfb
--- /dev/null
+++ b/start-jupyter.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+## Assumes you have Docker installed with the NVIDIA Container Toolkit
+# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/1.13.5/install-guide.html
+# sudo apt-get install -y nvidia-container-toolkit-base || yay -Syu nvidia-container-toolkit || echo etc...
+## Try to start an existing container, otherwise create a new one
+docker start jupyter 2> /dev/null || \
+docker run -it \
+ -d \
+ --gpus all \
+ -p 8888:8888 \
+ --name jupyter \
+ --user root \
+ -e NB_USER="$USER" \
+ -e CHOWN_HOME=yes \
+ -e GRANT_SUDO=yes \
+ -w "/home/${NB_USER}" \
+ -v "$PWD":"/home/$USER/work" \
+ jupyter/base-notebook
+
+## `docker logs jupyter` to get the URL link and token e.g.
+## http://127.0.0.1:8888/lab?token=blahblahblahblabhlaabhalbhalbhal
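+
+## To stop the container: docker stop jupyter
+## To discard it and start fresh (e.g. after changing the options above):
+## docker rm -f jupyter && ./start-jupyter.sh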