docker for inference, works on linux and windows
This commit is contained in:
parent
c54b343bfb
commit
741a6559e9
|
@ -23,5 +23,7 @@ thumbs.db
|
|||
*l40*
|
||||
*a40*
|
||||
|
||||
src/audiocraft
|
||||
|
||||
!/demo/
|
||||
!/demo/*
|
|
@ -11,6 +11,8 @@ To clone or edit an unseen voice, VoiceCraft needs only a few seconds of referen
|
|||
:star: 03/28/2024: Model weights are up on HuggingFace🤗 [here](https://huggingface.co/pyp1/VoiceCraft/tree/main)!
|
||||
|
||||
## QuickStart
|
||||
:star: To try out TTS inference with VoiceCraft, the best way is to use Docker. Thanks to [@ubergarm](https://github.com/ubergarm) and [@jayc88](https://github.com/jay-c88) for making this happen.
|
||||
|
||||
Tested on Linux and Windows and should work with any host with docker installed.
|
||||
```bash
|
||||
# 1. clone the repo into a directory on a drive with plenty of free space
|
||||
|
@ -38,11 +40,10 @@ sudo apt-get update
|
|||
nvidia-smi
|
||||
|
||||
# 7. Now in browser, open inference_tts.ipynb and work through one cell at a time
|
||||
echo GOOD LUCK AND BE NICE
|
||||
echo GOOD LUCK
|
||||
```
|
||||
|
||||
## TODO
|
||||
The TODOs left will be completed by the end of March 2024.
|
||||
- [x] Codebase upload
|
||||
- [x] Environment setup
|
||||
- [x] Inference demo for speech editing and TTS
|
||||
|
|
|
@ -8,7 +8,8 @@
|
|||
"source": [
|
||||
"import os\n",
|
||||
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
|
||||
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\""
|
||||
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\"\n",
|
||||
"os.environ[\"USER\"] = \"YOUR_USERNAME\" # TODO change this to your username"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
@ -104,6 +104,7 @@
|
|||
"import os\n",
|
||||
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
|
||||
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
|
||||
"os.environ[\"USER\"] = \"YOUR_USERNAME\" # TODO change this to your username\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torchaudio\n",
|
||||
|
@ -120,56 +121,11 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# hyperparameters for inference\n",
|
||||
"left_margin = 0.08 # not used for TTS, only for speech editing\n",
|
||||
"right_margin = 0.08 # not used for TTS, only for speech editing\n",
|
||||
"codec_audio_sr = 16000\n",
|
||||
"codec_sr = 50\n",
|
||||
"top_k = 0\n",
|
||||
"top_p = 0.8\n",
|
||||
"temperature = 1\n",
|
||||
"kvcache = 1\n",
|
||||
"silence_tokens=[1388,1898,131]\n",
|
||||
"# adjust the below three arguments if the generation is not as good\n",
|
||||
"seed = 1 # random seed magic\n",
|
||||
"stop_repetition = 3 # if there are long silences in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n",
|
||||
"sample_batch_size = 4 # if there are long silences or unnaturally stretched words, increase sample_batch_size to 2, 3 or even 4\n",
|
||||
"# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n",
|
||||
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"\n",
|
||||
"# point to the original file or record the file\n",
|
||||
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
|
||||
"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
|
||||
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
|
||||
"\n",
|
||||
"# move the audio and transcript to temp folder\n",
|
||||
"temp_folder = \"./demo/temp\"\n",
|
||||
"os.makedirs(temp_folder, exist_ok=True)\n",
|
||||
"os.system(f\"cp {orig_audio} {temp_folder}\")\n",
|
||||
"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
|
||||
"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
|
||||
" f.write(orig_transcript)\n",
|
||||
"# run MFA to get the alignment\n",
|
||||
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
|
||||
"os.makedirs(align_temp, exist_ok=True)\n",
|
||||
"\n",
|
||||
"# get into the conda environment and download the needed MFA models\n",
|
||||
"# install MFA models and dictionaries if you haven't done so already\n",
|
||||
"!source ~/.bashrc && \\\n",
|
||||
" conda activate voicecraft && \\\n",
|
||||
" mfa model download dictionary english_us_arpa && \\\n",
|
||||
" mfa model download acoustic english_us_arpa\n",
|
||||
"\n",
|
||||
"#os.system(f\". ~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
|
||||
"!source ~/.bashrc && \\\n",
|
||||
" conda activate voicecraft && \\\n",
|
||||
" mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
|
||||
" english_us_arpa english_us_arpa {align_temp}\n",
|
||||
"\n",
|
||||
"# if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
|
||||
"# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",
|
||||
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
|
||||
"transcript_fn = f\"{temp_folder}/{filename}.txt\"\n",
|
||||
"align_fn = f\"{align_temp}/{filename}.csv\""
|
||||
" mfa model download acoustic english_us_arpa"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -178,20 +134,12 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# take a look at demo/temp/mfa_alignments, decide which part of the audio to use as prompt\n",
|
||||
"cut_off_sec = 3.01 # NOTE: according to forced-alignment file, the word \"common\" stops at 3.01 sec, this should be different for different audio\n",
|
||||
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
|
||||
"info = torchaudio.info(audio_fn)\n",
|
||||
"audio_dur = info.num_frames / info.sample_rate\n",
|
||||
"\n",
|
||||
"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
|
||||
"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# load model, encodec, and phn2num\n",
|
||||
"# # load model, tokenizer, and other necessary files\n",
|
||||
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
||||
"from models import voicecraft\n",
|
||||
"#import models.voicecraft as voicecraft\n",
|
||||
"voicecraft_name=\"giga830M.pth\"\n",
|
||||
"voicecraft_name=\"giga830M.pth\" # or giga330M.pth\n",
|
||||
"ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
|
||||
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
|
||||
"if not os.path.exists(ckpt_fn):\n",
|
||||
|
@ -210,9 +158,75 @@
|
|||
"phn2num = ckpt['phn2num']\n",
|
||||
"\n",
|
||||
"text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
|
||||
"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
|
||||
"audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) # will also put the neural codec model on gpu\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Prepare your audio\n",
|
||||
"# point to the original audio whose speech you want to clone\n",
|
||||
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
|
||||
"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
|
||||
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
|
||||
"\n",
|
||||
"# move the audio and transcript to temp folder\n",
|
||||
"temp_folder = \"./demo/temp\"\n",
|
||||
"os.makedirs(temp_folder, exist_ok=True)\n",
|
||||
"os.system(f\"cp {orig_audio} {temp_folder}\")\n",
|
||||
"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
|
||||
"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
|
||||
" f.write(orig_transcript)\n",
|
||||
"# run MFA to get the alignment\n",
|
||||
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
|
||||
"!source ~/.bashrc && \\\n",
|
||||
" conda activate voicecraft && \\\n",
|
||||
" mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
|
||||
" english_us_arpa english_us_arpa {align_temp}\n",
|
||||
"\n",
|
||||
"# # if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
|
||||
"# !source ~/.bashrc && \\\n",
|
||||
"# conda activate voicecraft && \\\n",
|
||||
"# mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
|
||||
"# english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# take a look at demo/temp/mfa_alignments, decide which part of the audio to use as prompt\n",
|
||||
"cut_off_sec = 3.01 # NOTE: according to forced-alignment file demo/temp/mfa_alignments/84_121550_000074_000000.csv, the word \"common\" stops at 3.01 sec, this should be different for different audio\n",
|
||||
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
|
||||
"# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.\n",
|
||||
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
|
||||
"info = torchaudio.info(audio_fn)\n",
|
||||
"audio_dur = info.num_frames / info.sample_rate\n",
|
||||
"\n",
|
||||
"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
|
||||
"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
|
||||
"\n",
|
||||
"# run the model to get the output\n",
|
||||
"# hyperparameters for inference\n",
|
||||
"codec_audio_sr = 16000\n",
|
||||
"codec_sr = 50\n",
|
||||
"top_k = 0\n",
|
||||
"top_p = 0.8\n",
|
||||
"temperature = 1\n",
|
||||
"silence_tokens=[1388,1898,131]\n",
|
||||
"kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model\n",
|
||||
"\n",
|
||||
"# NOTE adjust the below three arguments if the generation is not as good\n",
|
||||
"stop_repetition = 3 # NOTE if the model generates long silences, reduce the stop_repetition to 3, 2 or even 1\n",
|
||||
"sample_batch_size = 4 # NOTE: if there are long silences or unnaturally stretched words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated audio is too fast, change it to a smaller number.\n",
|
||||
"seed = 1 # change seed if you are still unhappy with the result\n",
|
||||
"\n",
|
||||
"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",
|
||||
"from inference_tts_scale import inference_one_sample\n",
|
||||
"concated_audio, gen_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",
|
||||
|
|
Loading…
Reference in New Issue