241 lines
9.7 KiB
Plaintext
241 lines
9.7 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"VoiceCraft Inference Text To Speech Demo\n",
|
|
"==="
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Select 'voicecraft' as the kernel"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import libs\n",
|
|
"# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n",
|
|
"import os\n",
|
|
"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",
|
|
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
|
|
"os.environ[\"USER\"] = \"me\" # TODO change this to your username\n",
|
|
"\n",
|
|
"import torch\n",
|
|
"import torchaudio\n",
|
|
"import numpy as np\n",
|
|
"import random\n",
|
|
"from argparse import Namespace\n",
|
|
"\n",
|
|
"from data.tokenizer import (\n",
|
|
" AudioTokenizer,\n",
|
|
" TextTokenizer,\n",
|
|
")\n",
|
|
"from huggingface_hub import hf_hub_download"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# # install MFA models and dictionaries if you haven't done so already (this is already done in the Dockerfile or environment setup)\n",
|
|
"# !source ~/.bashrc && \\\n",
|
|
"# conda activate voicecraft && \\\n",
|
|
"# mfa model download dictionary english_us_arpa && \\\n",
|
|
"# mfa model download acoustic english_us_arpa"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Dora directory: /tmp/audiocraft_me\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# load model, encodec, and phn2num\n",
|
|
"# # load model, tokenizer, and other necessary files\n",
|
|
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
|
|
"voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
|
|
"\n",
|
|
"# the new way of loading the model, with huggingface, recommended\n",
|
|
"from models.voicecraft import VoiceCraftHF\n",
|
|
"model = VoiceCraftHF.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
|
|
"phn2num = model.args.phn2num\n",
|
|
"config = vars(model.args)\n",
|
|
"model.to(device)\n",
|
|
"\n",
|
|
"\n",
|
|
"# # the old way of loading the model\n",
|
|
"# from models import voicecraft\n",
|
|
"# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n",
|
|
"# ckpt = torch.load(filepath, map_location=\"cpu\")\n",
|
|
"# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
|
|
"# model.load_state_dict(ckpt[\"model\"])\n",
|
|
"# config = vars(model.args)\n",
|
|
"# phn2num = ckpt[\"phn2num\"]\n",
|
|
"# model.to(device)\n",
|
|
"# model.eval()\n",
|
|
"\n",
|
|
"\n",
|
|
"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
|
|
"if not os.path.exists(encodec_fn):\n",
|
|
" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
|
|
" os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
|
|
"audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) # will also put the neural codec model on gpu\n",
|
|
"\n",
|
|
"text_tokenizer = TextTokenizer(backend=\"espeak\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Prepare your audio\n",
|
|
"# point to the original audio whose speech you want to clone\n",
|
|
"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
|
|
"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
|
|
"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
|
|
"\n",
|
|
"# move the audio and transcript to temp folder\n",
|
|
"temp_folder = \"./demo/temp\"\n",
|
|
"os.makedirs(temp_folder, exist_ok=True)\n",
|
|
"os.system(f\"cp {orig_audio} {temp_folder}\")\n",
|
|
"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
|
|
"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
|
|
" f.write(orig_transcript)\n",
|
|
"# run MFA to get the alignment\n",
|
|
"align_temp = f\"{temp_folder}/mfa_alignments\"\n",
|
|
"!source ~/.bashrc && \\\n",
|
|
" conda activate voicecraft && \\\n",
|
|
" mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
|
|
" english_us_arpa english_us_arpa {align_temp}\n",
|
|
"\n",
|
|
"# # if the above fails, it could be because the audio is too hard for the alignment model; increasing the beam size usually solves the issue\n",
|
|
"# !source ~/.bashrc && \\\n",
|
|
"# conda activate voicecraft && \\\n",
|
|
"# mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
|
|
"# english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# take a look at demo/temp/mfa_alignments, and decide which part of the audio to use as prompt\n",
|
|
"cut_off_sec = 3.01 # NOTE: according to forced-alignment file demo/temp/mfa_alignments/84_121550_000074_000000.csv, the word \"common\" stops at 3.01 sec; this will be different for different audio\n",
|
|
"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
|
|
"# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.\n",
|
|
"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
|
|
"info = torchaudio.info(audio_fn)\n",
|
|
"audio_dur = info.num_frames / info.sample_rate\n",
|
|
"\n",
|
|
"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
|
|
"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
|
|
"\n",
|
|
"# run the model to get the output\n",
|
|
"# hyperparameters for inference\n",
|
|
"codec_audio_sr = 16000\n",
|
|
"codec_sr = 50\n",
|
|
"top_k = 0\n",
|
|
"top_p = 0.9 # can also try 0.8, but 0.9 seems to work better\n",
|
|
"temperature = 1\n",
|
|
"silence_tokens=[1388,1898,131]\n",
|
|
"kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model\n",
|
|
"\n",
|
|
"# NOTE adjust the below three arguments if the generation is not as good\n",
|
|
"stop_repetition = 3 # NOTE if the model generates long silences, reduce stop_repetition to 3, 2 or even 1\n",
|
|
"sample_batch_size = 5 # NOTE: if there are long silences or unnaturally stretched words, increase sample_batch_size to 5 or higher. What this does is make the model run sample_batch_size examples of the same audio and pick the one that's the shortest. So if the speech rate of the generated audio is too fast, change it to a smaller number.\n",
|
|
"seed = 1 # change seed if you are still unhappy with the result\n",
|
|
"\n",
|
|
"def seed_everything(seed):\n",
|
|
" os.environ['PYTHONHASHSEED'] = str(seed)\n",
|
|
" random.seed(seed)\n",
|
|
" np.random.seed(seed)\n",
|
|
" torch.manual_seed(seed)\n",
|
|
" torch.cuda.manual_seed(seed)\n",
|
|
" torch.backends.cudnn.benchmark = False\n",
|
|
" torch.backends.cudnn.deterministic = True\n",
|
|
"seed_everything(seed)\n",
|
|
"\n",
|
|
"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",
|
|
"from inference_tts_scale import inference_one_sample\n",
|
|
"concated_audio, gen_audio = inference_one_sample(model, Namespace(**config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",
|
|
" \n",
|
|
"# save segments for comparison\n",
|
|
"concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",
|
|
"# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",
|
|
"\n",
|
|
"\n",
|
|
"# display the audio\n",
|
|
"from IPython.display import Audio\n",
|
|
"print(\"concatenate prompt and generated:\")\n",
|
|
"display(Audio(concated_audio, rate=codec_audio_sr))\n",
|
|
"\n",
|
|
"print(\"generated:\")\n",
|
|
"display(Audio(gen_audio, rate=codec_audio_sr))\n",
|
|
"\n",
|
|
"# # save the audio\n",
|
|
"# # output_dir\n",
|
|
"# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",
|
|
"# os.makedirs(output_dir, exist_ok=True)\n",
|
|
"# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",
|
|
"# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n",
|
|
"\n",
|
|
"# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",
|
|
"# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",
|
|
"\n",
|
|
"# you might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "voicecraft",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.18"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|