Docker for inference; works on Linux and Windows

Author: jason-on-salt-a40
Date: 2024-03-30 11:32:09 -07:00
parent c54b343bfb
commit 741a6559e9
4 changed files with 80 additions and 62 deletions

.gitignore vendored

@@ -23,5 +23,7 @@ thumbs.db
*l40*
*a40*
src/audiocraft
!/demo/
!/demo/*


@@ -11,6 +11,8 @@ To clone or edit an unseen voice, VoiceCraft needs only a few seconds of reference
:star: 03/28/2024: Model weights are up on HuggingFace🤗 [here](https://huggingface.co/pyp1/VoiceCraft/tree/main)!
## QuickStart
:star: To try out TTS inference with VoiceCraft, the easiest way is to use Docker. Thanks to [@ubergarm](https://github.com/ubergarm) and [@jayc88](https://github.com/jay-c88) for making this happen.
Tested on Linux and Windows; it should work on any host with Docker installed.
```bash
# 1. clone the repo into a directory on a drive with plenty of free space
@@ -38,11 +40,10 @@ sudo apt-get update
nvidia-smi
# 7. Now in browser, open inference_tts.ipynb and work through one cell at a time
echo GOOD LUCK
```
## TODO
The TODOs left will be completed by the end of March 2024.
- [x] Codebase upload
- [x] Environment setup
- [x] Inference demo for speech editing and TTS


@@ -8,7 +8,8 @@
"source": [ "source": [
"import os\n", "import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n", "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\"" "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\"\n",
"os.environ[\"USER\"] = \"YOUR_USERNAME\" # TODO change this to your username"
]
},
{
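This hunk pins the GPU and adds an explicit `USER` export before the rest of the notebook runs. If you prefer not to hard-code the username, a minimal sketch of the same setup using `getpass` (an alternative to the notebook's `YOUR_USERNAME` placeholder, not what the diff itself does):

```python
import os
import getpass

# select the GPU before torch/CUDA is initialized
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # index of the GPU to use

# the notebooks export USER explicitly; getpass avoids hard-coding the name
os.environ["USER"] = getpass.getuser()
```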


@@ -104,6 +104,7 @@
"import os\n", "import os\n",
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n", "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
"os.environ[\"USER\"] = \"YOUR_USERNAME\" # TODO change this to your username\n",
"\n", "\n",
"import torch\n", "import torch\n",
"import torchaudio\n", "import torchaudio\n",
@@ -120,56 +121,11 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# hyperparameters for inference\n", "# install MFA models and dictionaries if you haven't done so already\n",
"left_margin = 0.08 # not used for TTS, only for speech editing\n",
"right_margin = 0.08 # not used for TTS, only for speech editing\n",
"codec_audio_sr = 16000\n",
"codec_sr = 50\n",
"top_k = 0\n",
"top_p = 0.8\n",
"temperature = 1\n",
"kvcache = 1\n",
"silence_tokens=[1388,1898,131]\n",
"# adjust the below three arguments if the generation is not as good\n",
"seed = 1 # random seed magic\n",
"stop_repetition = 3 # if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n",
"sample_batch_size = 4 # if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4\n",
"# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# point to the original file or record the file\n",
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
"\n",
"# move the audio and transcript to temp folder\n",
"temp_folder = \"./demo/temp\"\n",
"os.makedirs(temp_folder, exist_ok=True)\n",
"os.system(f\"cp {orig_audio} {temp_folder}\")\n",
"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
" f.write(orig_transcript)\n",
"# run MFA to get the alignment\n",
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
"os.makedirs(align_temp, exist_ok=True)\n",
"\n",
"# get into the conda environment and download the needed MFA models\n",
"!source ~/.bashrc && \\\n", "!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n", " conda activate voicecraft && \\\n",
" mfa model download dictionary english_us_arpa && \\\n", " mfa model download dictionary english_us_arpa && \\\n",
" mfa model download acoustic english_us_arpa\n", " mfa model download acoustic english_us_arpa"
"\n",
"#os.system(f\". ~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
" english_us_arpa english_us_arpa {align_temp}\n",
"\n",
"# if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
"# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
"transcript_fn = f\"{temp_folder}/{filename}.txt\"\n",
"align_fn = f\"{align_temp}/{filename}.csv\""
]
},
{
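In the new cell this hunk leaves behind, only the MFA dictionary and acoustic-model downloads remain; the alignment itself moves to a later cell. For running the same step outside Jupyter (without `!` cell magics), a rough Python equivalent, assuming the repo's `voicecraft` conda environment exists:

```python
import subprocess

# download the MFA English (ARPA) dictionary and acoustic model,
# exactly as the notebook cell does, but from plain Python
for cmd in (
    "mfa model download dictionary english_us_arpa",
    "mfa model download acoustic english_us_arpa",
):
    subprocess.run(
        f"source ~/.bashrc && conda activate voicecraft && {cmd}",
        shell=True,
        check=True,
        executable="/bin/bash",
    )
```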
@@ -178,20 +134,12 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n", "# load model, encodec, and phn2num\n",
"cut_off_sec = 3.01 # NOTE: according to forced-alignment file, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
"info = torchaudio.info(audio_fn)\n",
"audio_dur = info.num_frames / info.sample_rate\n",
"\n",
"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
"\n",
"\n",
"# # load model, tokenizer, and other necessary files\n", "# # load model, tokenizer, and other necessary files\n",
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
"from models import voicecraft\n", "from models import voicecraft\n",
"#import models.voicecraft as voicecraft\n", "#import models.voicecraft as voicecraft\n",
"voicecraft_name=\"giga830M.pth\"\n", "voicecraft_name=\"giga830M.pth\" # or giga330M.pth\n",
"ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n", "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n", "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
"if not os.path.exists(ckpt_fn):\n", "if not os.path.exists(ckpt_fn):\n",
@@ -210,9 +158,75 @@
"phn2num = ckpt['phn2num']\n", "phn2num = ckpt['phn2num']\n",
"\n", "\n",
"text_tokenizer = TextTokenizer(backend=\"espeak\")\n", "text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n", "audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) # will also put the neural codec model on gpu\n"
]
},
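The cell above builds `ckpt_fn` and `encodec_fn` under `./pretrained_models` and downloads them if missing (the download lines sit outside this hunk). A hedged sketch of fetching the same files programmatically, assuming the filenames on the `pyp1/VoiceCraft` HuggingFace repo match the names above and that `huggingface_hub` is installed:

```python
import os
import shutil

from huggingface_hub import hf_hub_download  # assumption: huggingface_hub is installed

os.makedirs("./pretrained_models", exist_ok=True)
for fname in ("giga830M.pth", "encodec_4cb2048_giga.th"):
    target = f"./pretrained_models/{fname}"
    if not os.path.exists(target):
        # hf_hub_download returns the path of the cached file; copy it next to the notebook
        cached = hf_hub_download(repo_id="pyp1/VoiceCraft", filename=fname)
        shutil.copy(cached, target)
```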
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Prepare your audio\n",
"# point to the original audio whose speech you want to clone\n",
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
"\n",
"# move the audio and transcript to temp folder\n",
"temp_folder = \"./demo/temp\"\n",
"os.makedirs(temp_folder, exist_ok=True)\n",
"os.system(f\"cp {orig_audio} {temp_folder}\")\n",
"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
" f.write(orig_transcript)\n",
"# run MFA to get the alignment\n",
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
"!source ~/.bashrc && \\\n",
" conda activate voicecraft && \\\n",
" mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
" english_us_arpa english_us_arpa {align_temp}\n",
"\n",
"# # if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
"# !source ~/.bashrc && \\\n",
"# conda activate voicecraft && \\\n",
"# mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
"# english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\n",
"\n"
]
},
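The next cell chooses `cut_off_sec` from the forced alignment, so it helps to inspect the CSV first. A small sketch for printing it (the path follows `align_temp` and `filename` from the cell above; column names vary by MFA version):

```python
import csv

align_fn = "./demo/temp/mfa_alignments/84_121550_000074_000000.csv"

# print every alignment row; look for the end time of the word you want
# to cut the reference prompt at (e.g. "common" in the demo transcript)
with open(align_fn, newline="") as f:
    for row in csv.DictReader(f):
        print(row)
```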
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n",
"cut_off_sec = 3.01 # NOTE: according to forced-alignment file demo/temp/mfa_alignments/84_121550_000074_000000.csv, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
"# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.\n",
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
"info = torchaudio.info(audio_fn)\n",
"audio_dur = info.num_frames / info.sample_rate\n",
"\n",
"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
"\n", "\n",
"# run the model to get the output\n", "# run the model to get the output\n",
"# hyperparameters for inference\n",
"codec_audio_sr = 16000\n",
"codec_sr = 50\n",
"top_k = 0\n",
"top_p = 0.8\n",
"temperature = 1\n",
"silence_tokens=[1388,1898,131]\n",
"kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model\n",
"\n",
"# NOTE adjust the below three arguments if the generation is not as good\n",
"stop_repetition = 3 # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1\n",
"sample_batch_size = 4 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.\n",
"seed = 1 # change seed if you are still unhappy with the result\n",
"\n",
"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n", "decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",
"from inference_tts_scale import inference_one_sample\n", "from inference_tts_scale import inference_one_sample\n",
"concated_audio, gen_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n", "concated_audio, gen_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",