VoiceCraft/inference_tts.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "VoiceCraft Inference Text To Speech Demo\n",
    "==="
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Select 'voicecraft' as the kernel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import libs\n",
    "# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n",
    "import os\n",
    "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   \n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
    "os.environ[\"USER\"] = \"me\" # TODO change this to your username\n",
    "\n",
    "import torch\n",
    "import torchaudio\n",
    "import numpy as np\n",
    "import random\n",
    "from argparse import Namespace\n",
    "\n",
    "from data.tokenizer import (\n",
    "    AudioTokenizer,\n",
    "    TextTokenizer,\n",
    ")\n",
    "from huggingface_hub import hf_hub_download"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # install MFA models and dictionaries if you haven't done so already (this is already done by the Dockerfile or the environment setup)\n",
    "# !source ~/.bashrc && \\\n",
    "#     conda activate voicecraft && \\\n",
    "#     mfa model download dictionary english_us_arpa && \\\n",
    "#     mfa model download acoustic english_us_arpa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Dora directory: /tmp/audiocraft_me\n"
     ]
    }
   ],
   "source": [
    "# load model, encodec, and phn2num\n",
    "# # load model, tokenizer, and other necessary files\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",
    "\n",
    "# the new way of loading the model, with huggingface, recommended\n",
    "from models import voicecraft\n",
    "model = voicecraft.VoiceCraft.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",
    "phn2num = model.args.phn2num\n",
    "config = vars(model.args)\n",
    "model.to(device)\n",
    "\n",
    "\n",
    "# # the old way of loading the model\n",
    "# from models import voicecraft\n",
    "# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n",
    "# ckpt = torch.load(filepath, map_location=\"cpu\")\n",
    "# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
    "# model.load_state_dict(ckpt[\"model\"])\n",
    "# config = vars(model.args)\n",
    "# phn2num = ckpt[\"phn2num\"]\n",
    "# model.to(device)\n",
    "# model.eval()\n",
    "\n",
    "\n",
    "# download the encodec checkpoint on first use\n",
    "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
    "if not os.path.exists(encodec_fn):\n",
    "    # create the target folder if it is missing, otherwise the checkpoint has nowhere to land\n",
    "    os.makedirs(os.path.dirname(encodec_fn), exist_ok=True)\n",
    "    # download under a temporary name so an interrupted download is never mistaken for the finished checkpoint\n",
    "    encodec_tmp_fn = encodec_fn + \".part\"\n",
    "    wget_status = os.system(f\"wget -O {encodec_tmp_fn} https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
    "    if wget_status != 0:\n",
    "        # fail here with a clear message instead of letting AudioTokenizer fail on a missing/partial file\n",
    "        if os.path.exists(encodec_tmp_fn):\n",
    "            os.remove(encodec_tmp_fn)\n",
    "        raise RuntimeError(f\"failed to download the encodec checkpoint (wget exit status {wget_status})\")\n",
    "    os.replace(encodec_tmp_fn, encodec_fn)\n",
    "audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) # will also put the neural codec model on gpu\n",
    "\n",
    "text_tokenizer = TextTokenizer(backend=\"espeak\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare your audio\n",
    "# point to the original audio whose speech you want to clone\n",
    "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
    "orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
    "orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
    "\n",
    "# copy the audio and write the transcript into the temp folder (the folder MFA is run on below)\n",
    "import shutil\n",
    "\n",
    "temp_folder = \"./demo/temp\"\n",
    "os.makedirs(temp_folder, exist_ok=True)\n",
    "# shutil.copy raises if the source audio is missing and handles paths containing spaces, unlike a shell `cp`\n",
    "shutil.copy(orig_audio, temp_folder)\n",
    "# base name of the audio without its extension; the transcript must share it so MFA can pair the two files\n",
    "filename = os.path.splitext(os.path.basename(orig_audio))[0]\n",
    "with open(f\"{temp_folder}/{filename}.txt\", \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(orig_transcript)\n",
    "# run MFA to get the alignment\n",
    "align_temp = f\"{temp_folder}/mfa_alignments\"\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
    "        english_us_arpa english_us_arpa {align_temp}\n",
    "\n",
    "# # if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
    "# !source ~/.bashrc && \\\n",
    "#     conda activate voicecraft && \\\n",
    "#     mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",
    "#         english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# take a look at demo/temp/mfa_alignments and decide which part of the audio to use as the prompt\n",
    "cut_off_sec = 3.01 # NOTE: according to the forced-alignment file demo/temp/mfa_alignments/84_121550_000074_000000.csv, the word \"common\" ends at 3.01 sec; this value will be different for different audio\n",
    "target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
    "# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.\n",
    "audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
    "info = torchaudio.info(audio_fn)\n",
    "audio_dur = info.num_frames / info.sample_rate\n",
    "\n",
    "assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
    "prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
    "\n",
    "# run the model to get the output\n",
    "# hyperparameters for inference\n",
    "codec_audio_sr = 16000\n",
    "codec_sr = 50\n",
    "top_k = 0\n",
    "top_p = 0.9 # can also try 0.8, but 0.9 seems to work better\n",
    "temperature = 1\n",
    "silence_tokens=[1388,1898,131]\n",
    "kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model\n",
    "\n",
    "# NOTE adjust the below three arguments if the generation is not as good\n",
    "stop_repetition = 3 # NOTE if the model generates long silences, reduce stop_repetition to 3, 2 or even 1\n",
    "sample_batch_size = 5 # NOTE: if there are long silences or unnaturally stretched words, increase sample_batch_size to 5 or higher. The model will then generate sample_batch_size samples for the same input and pick the shortest one. So if the generated speech is too fast, change it to a smaller number.\n",
    "seed = 1 # change seed if you are still unhappy with the result\n",
    "\n",
    "def seed_everything(seed):\n",
    "    os.environ['PYTHONHASHSEED'] = str(seed)\n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    torch.manual_seed(seed)\n",
    "    torch.cuda.manual_seed(seed)\n",
    "    torch.backends.cudnn.benchmark = False\n",
    "    torch.backends.cudnn.deterministic = True\n",
    "seed_everything(seed)\n",
    "\n",
    "decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",
    "from inference_tts_scale import inference_one_sample\n",
    "concated_audio, gen_audio = inference_one_sample(model, Namespace(**config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",
    "        \n",
    "# save segments for comparison\n",
    "concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",
    "# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",
    "\n",
    "\n",
    "# display the audio\n",
    "from IPython.display import Audio\n",
    "print(\"concatenate prompt and generated:\")\n",
    "display(Audio(concated_audio, rate=codec_audio_sr))\n",
    "\n",
    "print(\"generated:\")\n",
    "display(Audio(gen_audio, rate=codec_audio_sr))\n",
    "\n",
    "# # save the audio\n",
    "# # output_dir\n",
    "# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",
    "# os.makedirs(output_dir, exist_ok=True)\n",
    "# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",
    "# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\"        \n",
    "\n",
    "# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",
    "# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",
    "\n",
    "# you might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "voicecraft",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
init 2024-03-21 19:02:20 +01:00			`{`
			`"cells": [`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"VoiceCraft Inference Text To Speech Demo\n",`
modify the Dockerfile, download correct lib versions 2024-04-04 21:49:37 +02:00			`"==="`
init 2024-03-21 19:02:20 +01:00			`]`
			`},`
clearer tts instruction 2024-03-30 20:45:26 +01:00			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
modify the Dockerfile, download correct lib versions 2024-04-04 21:49:37 +02:00			`"### Select 'voicecraft' as the kernel"`
clearer tts instruction 2024-03-30 20:45:26 +01:00			`]`
			`},`
init 2024-03-21 19:02:20 +01:00			`{`
			`"cell_type": "code",`
hf model download 2024-04-14 00:11:15 +02:00			`"execution_count": 1,`
init 2024-03-21 19:02:20 +01:00			`"metadata": {},`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"outputs": [],`
init 2024-03-21 19:02:20 +01:00			`"source": [`
			`"# import libs\n",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n",`
			`"import os\n",`
			`"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",`
			`"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",`
hf model download 2024-04-14 00:11:15 +02:00			`"os.environ[\"USER\"] = \"me\" # TODO change this to your username\n",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"\n",`
init 2024-03-21 19:02:20 +01:00			`"import torch\n",`
			`"import torchaudio\n",`
revised env setup, random seed effective 2024-03-31 22:50:20 +02:00			`"import numpy as np\n",`
			`"import random\n",`
avoid zero size batch caused by grad_accumulation 2024-04-09 20:33:58 +02:00			`"from argparse import Namespace\n",`
init 2024-03-21 19:02:20 +01:00			`"\n",`
			`"from data.tokenizer import (\n",`
			`" AudioTokenizer,\n",`
			`" TextTokenizer,\n",`
hf model download 2024-04-14 00:11:15 +02:00			`")\n",`
			`"from huggingface_hub import hf_hub_download"`
init 2024-03-21 19:02:20 +01:00			`]`
			`},`
			`{`
			`"cell_type": "code",`
hf model download 2024-04-14 00:11:15 +02:00			`"execution_count": 2,`
init 2024-03-21 19:02:20 +01:00			`"metadata": {},`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"outputs": [],`
init 2024-03-21 19:02:20 +01:00			`"source": [`
hf model download 2024-04-14 00:11:15 +02:00			`"# # install MFA models and dictionaries if you haven't done so already, already done in the dockerfile or envrionment setup\n",`
			`"# !source ~/.bashrc && \\\n",`
			`"# conda activate voicecraft && \\\n",`
			`"# mfa model download dictionary english_us_arpa && \\\n",`
			`"# mfa model download acoustic english_us_arpa"`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`]`
			`},`
			`{`
			`"cell_type": "code",`
hf model download 2024-04-14 00:11:15 +02:00			`"execution_count": 3,`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"metadata": {},`
hf model download 2024-04-14 00:11:15 +02:00			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"Dora directory: /tmp/audiocraft_me\n"`
			`]`
			`}`
			`],`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"source": [`
			`"# load model, encodec, and phn2num\n",`
			`"# # load model, tokenizer, and other necessary files\n",`
init 2024-03-21 19:02:20 +01:00			`"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",`
avoid zero size batch caused by grad_accumulation 2024-04-09 20:33:58 +02:00			`"voicecraft_name=\"giga330M.pth\" # or gigaHalfLibri330M_TTSEnhanced_max16s.pth, giga830M.pth\n",`
			`"\n",`
hf model download 2024-04-14 00:11:15 +02:00			`"# the new way of loading the model, with huggingface, recommended\n",`
better hf integration 2024-04-16 17:55:35 +02:00			`"from models import voicecraft\n",`
			`"model = voicecraft.VoiceCraft.from_pretrained(f\"pyp1/VoiceCraft_{voicecraft_name.replace('.pth', '')}\")\n",`
hf model download 2024-04-14 00:11:15 +02:00			`"phn2num = model.args.phn2num\n",`
			`"config = vars(model.args)\n",`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"model.to(device)\n",`
			`"\n",`
hf model download 2024-04-14 00:11:15 +02:00			`"\n",`
			`"# # the old way of loading the model\n",`
			`"# from models import voicecraft\n",`
			`"# filepath = hf_hub_download(repo_id=\"pyp1/VoiceCraft\", filename=voicecraft_name, repo_type=\"model\")\n",`
			`"# ckpt = torch.load(filepath, map_location=\"cpu\")\n",`
			`"# model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",`
			`"# model.load_state_dict(ckpt[\"model\"])\n",`
			`"# config = vars(model.args)\n",`
			`"# phn2num = ckpt[\"phn2num\"]\n",`
avoid zero size batch caused by grad_accumulation 2024-04-09 20:33:58 +02:00			`"# model.to(device)\n",`
			`"# model.eval()\n",`
			`"\n",`
			`"\n",`
			`"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",`
			`"if not os.path.exists(encodec_fn):\n",`
			`" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",`
			`" os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",`
			`"audio_tokenizer = AudioTokenizer(signature=encodec_fn, device=device) # will also put the neural codec model on gpu\n",`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"\n",`
avoid zero size batch caused by grad_accumulation 2024-04-09 20:33:58 +02:00			`"text_tokenizer = TextTokenizer(backend=\"espeak\")\n"`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`]`
			`},`
			`{`
			`"cell_type": "code",`
hf model download 2024-04-14 00:11:15 +02:00			`"execution_count": 4,`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Prepare your audio\n",`
			`"# point to the original audio whose speech you want to clone\n",`
init 2024-03-21 19:02:20 +01:00			`"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",`
init 2024-03-21 19:02:20 +01:00			`"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",`
			`"\n",`
			`"# move the audio and transcript to temp folder\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"temp_folder = \"./demo/temp\"\n",`
init 2024-03-21 19:02:20 +01:00			`"os.makedirs(temp_folder, exist_ok=True)\n",`
			`"os.system(f\"cp {orig_audio} {temp_folder}\")\n",`
			`"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",`
			`"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",`
			`" f.write(orig_transcript)\n",`
			`"# run MFA to get the alignment\n",`
			`"align_temp = f\"{temp_folder}/mfa_alignments\"\n",`
Add start-jupyter.bat for windows and fixup dependencies. Thanks to [jay-c88](https://github.com/jasonppy/VoiceCraft/pull/25#issuecomment-2027980511) for the windows batch file! This does the thing to actually install the jupyter notebook kernel and instructs the user to refresh their browser to pickup the changes in the notebook. 2024-03-30 15:48:10 +01:00			`"!source ~/.bashrc && \\\n",`
			`" conda activate voicecraft && \\\n",`
			`" mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",`
			`" english_us_arpa english_us_arpa {align_temp}\n",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"\n",`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"# # if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",`
			`"# !source ~/.bashrc && \\\n",`
			`"# conda activate voicecraft && \\\n",`
			`"# mfa align -v --clean -j 1 --output_format csv {temp_folder} \\\n",`
			`"# english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\n",`
			`"\n"`
init 2024-03-21 19:02:20 +01:00			`]`
			`},`
			`{`
			`"cell_type": "code",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"execution_count": null,`
init 2024-03-21 19:02:20 +01:00			`"metadata": {},`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"outputs": [],`
init 2024-03-21 19:02:20 +01:00			`"source": [`
			`"# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n",`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"cut_off_sec = 3.01 # NOTE: according to forced-alignment file demo/temp/mfa_alignments/84_121550_000074_000000.csv, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",`
init 2024-03-21 19:02:20 +01:00			`"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"# NOTE: 3 sec of reference is generally enough for high quality voice cloning, but longer is generally better, try e.g. 3~6 sec.\n",`
			`"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",`
init 2024-03-21 19:02:20 +01:00			`"info = torchaudio.info(audio_fn)\n",`
			`"audio_dur = info.num_frames / info.sample_rate\n",`
			`"\n",`
			`"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",`
			`"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",`
			`"\n",`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"# run the model to get the output\n",`
			`"# hyperparameters for inference\n",`
			`"codec_audio_sr = 16000\n",`
			`"codec_sr = 50\n",`
			`"top_k = 0\n",`
upload TTS finetuned 330M model 2024-04-06 01:42:59 +02:00			`"top_p = 0.9 # can also try 0.8, but 0.9 seems to work better\n",`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"temperature = 1\n",`
			`"silence_tokens=[1388,1898,131]\n",`
			`"kvcache = 1 # NOTE if OOM, change this to 0, or try the 330M model\n",`
init 2024-03-21 19:02:20 +01:00			`"\n",`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"# NOTE adjust the below three arguments if the generation is not as good\n",`
			`"stop_repetition = 3 # NOTE if the model generate long silence, reduce the stop_repetition to 3, 2 or even 1\n",`
hf model download 2024-04-14 00:11:15 +02:00			`"sample_batch_size = 5 # NOTE: if the if there are long silence or unnaturally strecthed words, increase sample_batch_size to 5 or higher. What this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest. So if the speech rate of the generated is too fast change it to a smaller number.\n",`
docker for inference, works on linux and windows 2024-03-30 19:32:09 +01:00			`"seed = 1 # change seed if you are still unhappy with the result\n",`
init 2024-03-21 19:02:20 +01:00			`"\n",`
revised env setup, random seed effective 2024-03-31 22:50:20 +02:00			`"def seed_everything(seed):\n",`
			`" os.environ['PYTHONHASHSEED'] = str(seed)\n",`
			`" random.seed(seed)\n",`
			`" np.random.seed(seed)\n",`
			`" torch.manual_seed(seed)\n",`
			`" torch.cuda.manual_seed(seed)\n",`
			`" torch.backends.cudnn.benchmark = False\n",`
			`" torch.backends.cudnn.deterministic = True\n",`
			`"seed_everything(seed)\n",`
			`"\n",`
init 2024-03-21 19:02:20 +01:00			`"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",`
			`"from inference_tts_scale import inference_one_sample\n",`
avoid zero size batch caused by grad_accumulation 2024-04-09 20:33:58 +02:00			`"concated_audio, gen_audio = inference_one_sample(model, Namespace(**config), phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",`
init 2024-03-21 19:02:20 +01:00			`" \n",`
			`"# save segments for comparison\n",`
			`"concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",`
			`"# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",`
			`"\n",`
			`"\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"# display the audio\n",`
			`"from IPython.display import Audio\n",`
			`"print(\"concatenate prompt and generated:\")\n",`
			`"display(Audio(concated_audio, rate=codec_audio_sr))\n",`
			`"\n",`
			`"print(\"generated:\")\n",`
			`"display(Audio(gen_audio, rate=codec_audio_sr))\n",`
			`"\n",`
			`"# # save the audio\n",`
			`"# # output_dir\n",`
			`"# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",`
			`"# os.makedirs(output_dir, exist_ok=True)\n",`
			`"# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",`
			`"# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n",`
init 2024-03-21 19:02:20 +01:00			`"\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",`
			`"# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",`
init 2024-03-21 19:02:20 +01:00			`"\n",`
			`"# you are might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"`
			`]`
avoid zero size batch caused by grad_accumulation 2024-04-09 20:33:58 +02:00			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
init 2024-03-21 19:02:20 +01:00			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "voicecraft",`
			`"language": "python",`
revised env setup, random seed effective 2024-03-31 22:50:20 +02:00			`"name": "python3"`
init 2024-03-21 19:02:20 +01:00			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
revised env setup, random seed effective 2024-03-31 22:50:20 +02:00			`"version": "3.9.18"`
init 2024-03-21 19:02:20 +01:00			`}`
			`},`
			`"nbformat": 4,`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"nbformat_minor": 4`
init 2024-03-21 19:02:20 +01:00			`}`