VoiceCraft/inference_tts.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "VoiceCraft Inference Text To Speech Demo\n",
    "===\n",
    "This notebook installs many system and Python packages on the machine it runs on, so consider running it inside a Docker container to contain the cruft.\n",
    "\n",
    "Run the next 4 code cells one at a time, then follow the STOP instructions below to change the Jupyter Notebook kernel to the voicecraft environment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# install OS deps\n",
    "!sudo apt-get update && sudo apt-get install -y \\\n",
    "    git-core \\\n",
    "    ffmpeg \\\n",
    "    espeak-ng"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Update and setup Conda voicecraft environment\n",
    "!conda update -y -n base -c conda-forge conda\n",
    "!conda create -y -n voicecraft python=3.9.16 && \\\n",
    "    conda init bash"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# install the conda and pip packages into the voicecraft conda environment created above\n",
    "!echo -e \"Grab a cup a coffee and a slice of pizza...\\n\\n\"\n",
    "\n",
    "# make sure $HOME and $USER are set up so that this can source the conda environment\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \\\n",
    "    pip install torch==2.0.1 && \\\n",
    "    pip install tensorboard==2.16.2 && \\\n",
    "    pip install phonemizer==3.2.1 && \\\n",
    "    pip install torchaudio==2.0.2 && \\\n",
    "    pip install datasets==2.16.0 && \\\n",
    "    pip install torchmetrics==0.11.1\n",
    "\n",
    "# do this one last, otherwise you'll get an error about the torch compiler missing due to an xformers mismatch\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set up the conda environment so that Jupyter Notebook can find the kernel\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    conda install -y -n voicecraft ipykernel --update-deps --force-reinstall"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# STOP\n",
    "You have to do this part manually using the mouse/keyboard and the menu bar at the top.\n",
    "\n",
    "* Kernel -> Change Kernel -> Select Kernel -> voicecraft\n",
    "* Kernel -> Restart Kernel -> Yes\n",
    "\n",
    "Now you can run the rest of the notebook and get an audio sample output. It will download more models and such."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import libs\n",
    "# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n",
    "import os\n",
    "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   \n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
    "\n",
    "import torch\n",
    "import torchaudio\n",
    "\n",
    "from data.tokenizer import (\n",
    "    AudioTokenizer,\n",
    "    TextTokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# hyperparameters for inference\n",
    "left_margin = 0.08 # not used for TTS, only for speech editing\n",
    "right_margin = 0.08 # not used for TTS, only for speech editing\n",
    "codec_audio_sr = 16000\n",
    "codec_sr = 50\n",
    "top_k = 0\n",
    "top_p = 0.8\n",
    "temperature = 1\n",
    "kvcache = 1\n",
    "silence_tokens=[1388,1898,131]\n",
    "# adjust the three arguments below if the generation quality is not good enough\n",
    "seed = 1 # random seed magic\n",
    "stop_repetition = 3 # if there are long silences in the generated audio, reduce stop_repetition to 3, 2 or even 1\n",
    "sample_batch_size = 4 # if there are long silences or unnaturally stretched words, increase sample_batch_size to 2, 3 or even 4\n",
    "# with this setting the model generates sample_batch_size examples for the same audio and picks the shortest one\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "# point to the original file or record the file\n",
    "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
    "orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
    "orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
    "\n",
    "# copy the audio and write the transcript to the temp folder\n",
    "temp_folder = \"./demo/temp\"\n",
    "os.makedirs(temp_folder, exist_ok=True)\n",
    "os.system(f\"cp {orig_audio} {temp_folder}\")\n",
    "filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",
    "with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",
    "    f.write(orig_transcript)\n",
    "# run MFA to get the alignment\n",
    "align_temp = f\"{temp_folder}/mfa_alignments\"\n",
    "os.makedirs(align_temp, exist_ok=True)\n",
    "\n",
    "# get into the conda environment and download the needed MFA models\n",
    "!source ~/.bashrc && \\\n",
    "    conda activate voicecraft && \\\n",
    "    mfa model download dictionary english_us_arpa && \\\n",
    "    mfa model download acoustic english_us_arpa\n",
    "\n",
    "os.system(f\". ~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",
    "\n",
    "# if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
    "# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",
    "audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
    "transcript_fn = f\"{temp_folder}/{filename}.txt\"\n",
    "align_fn = f\"{align_temp}/{filename}.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# take a look at demo/temp/mfa_alignments, decide which part of the audio to use as prompt\n",
    "cut_off_sec = 3.01 # NOTE: according to the forced-alignment file, the word \"common\" stops at 3.01 sec; this value will be different for different audio\n",
    "target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",
    "info = torchaudio.info(audio_fn)\n",
    "audio_dur = info.num_frames / info.sample_rate\n",
    "\n",
    "assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",
    "prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",
    "\n",
    "\n",
    "# # load model, tokenizer, and other necessary files\n",
    "from models import voicecraft\n",
    "#import models.voicecraft as voicecraft\n",
    "voicecraft_name=\"giga830M.pth\"\n",
    "ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",
    "encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",
    "if not os.path.exists(ckpt_fn):\n",
    "    os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\?download\=true\")\n",
    "    os.system(f\"mv {voicecraft_name}\?download\=true ./pretrained_models/{voicecraft_name}\")\n",
    "if not os.path.exists(encodec_fn):\n",
    "    os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",
    "    os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",
    "\n",
    "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
    "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
    "model.load_state_dict(ckpt[\"model\"])\n",
    "model.to(device)\n",
    "model.eval()\n",
    "\n",
    "phn2num = ckpt['phn2num']\n",
    "\n",
    "text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
    "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
    "\n",
    "# run the model to get the output\n",
    "decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",
    "from inference_tts_scale import inference_one_sample\n",
    "concated_audio, gen_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",
    "        \n",
    "# save segments for comparison\n",
    "concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",
    "# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",
    "\n",
    "\n",
    "# display the audio\n",
    "from IPython.display import Audio\n",
    "print(\"concatenate prompt and generated:\")\n",
    "display(Audio(concated_audio, rate=codec_audio_sr))\n",
    "\n",
    "print(\"generated:\")\n",
    "display(Audio(gen_audio, rate=codec_audio_sr))\n",
    "\n",
    "# # save the audio\n",
    "# # output_dir\n",
    "# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",
    "# os.makedirs(output_dir, exist_ok=True)\n",
    "# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",
    "# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\"        \n",
    "\n",
    "# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",
    "# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",
    "\n",
    "# if you get error importing T5 in transformers\n",
    "# try \n",
    "# pip uninstall Pillow\n",
    "# pip install Pillow\n",
    "# you might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "voicecraft",
   "language": "python",
   "name": "voicecraft"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
init 2024-03-21 19:02:20 +01:00			`{`
			`"cells": [`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"VoiceCraft Inference Text To Speech Demo\n",`
			`"===\n",`
			`"This will install a bunch of garbage all over so consider using a docker container to contain the cruft.\n",`
			`"\n",`
			`"Run the next 5 cells one at a time then change the Jupyter Notebook Kernel to use the voicecraft environment."`
			`]`
			`},`
init 2024-03-21 19:02:20 +01:00			`{`
			`"cell_type": "code",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"execution_count": null,`
init 2024-03-21 19:02:20 +01:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"# install OS deps\n",`
			`"!sudo apt-get update && sudo apt-get install -y \\\n",`
			`" git-core \\\n",`
			`" ffmpeg \\\n",`
			`" espeak-ng"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Update and setup Conda voicecraft environment\n",`
			`"!conda update -y -n base -c conda-forge conda\n",`
			`"!conda create -y -n voicecraft python=3.9.16 && \\\n",`
			`" conda init bash"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# install conda and pip stuff in the activated conda above context\n",`
			`"!echo -e \"Grab a cup a coffee and a slice of pizza...\\n\\n\"\n",`
			`"\n",`
			`"# make sure $HOME and $USER are setup so this will source the conda environment\n",`
			`"!source ~/.bashrc && \\\n",`
			`" conda activate voicecraft && \\\n",`
			`" conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \\\n",`
			`" pip install torch==2.0.1 && \\\n",`
			`" pip install tensorboard==2.16.2 && \\\n",`
			`" pip install phonemizer==3.2.1 && \\\n",`
			`" pip install torchaudio==2.0.2 && \\\n",`
			`" pip install datasets==2.16.0 && \\\n",`
			`" pip install torchmetrics==0.11.1\n",`
			`"\n",`
			`"# do this one last otherwise you'll get an error about torch compiler missing due to xformer mismatch\n",`
			`"!source ~/.bashrc && \\\n",`
			`" conda activate voicecraft && \\\n",`
			`" pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# okay setup the conda environment such that jupyter notebook can find the kernel\n",`
			`"!source ~/.bashrc && \\\n",`
			`" conda activate voicecraft && \\\n",`
			`" conda install -y -n voicecraft ipykernel --update-deps --force-reinstall"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"# STOP\n",`
			`"You have to do this part manually using the mouse/keyboard and the tabs at the top.\n",`
			`"\n",`
			`"* Kernel -> Change Kernel -> Select Kernel -> voicecraft\n",`
			`"* Kernel -> Restart Kernel -> Yes\n",`
			`"\n",`
			`"Now you can run the rest of the notebook and get an audio sample output. It will download more models and such."`
init 2024-03-21 19:02:20 +01:00			`]`
			`},`
			`{`
			`"cell_type": "code",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"execution_count": null,`
init 2024-03-21 19:02:20 +01:00			`"metadata": {},`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"outputs": [],`
init 2024-03-21 19:02:20 +01:00			`"source": [`
			`"# import libs\n",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"# if this throws an error, something went wrong installing dependencies or changing the kernel above!\n",`
			`"import os\n",`
			`"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",`
			`"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",`
			`"\n",`
init 2024-03-21 19:02:20 +01:00			`"import torch\n",`
			`"import torchaudio\n",`
			`"\n",`
			`"from data.tokenizer import (\n",`
			`" AudioTokenizer,\n",`
			`" TextTokenizer,\n",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`")"`
init 2024-03-21 19:02:20 +01:00			`]`
			`},`
			`{`
			`"cell_type": "code",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"execution_count": null,`
init 2024-03-21 19:02:20 +01:00			`"metadata": {},`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"outputs": [],`
init 2024-03-21 19:02:20 +01:00			`"source": [`
			`"# hyperparameters for inference\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"left_margin = 0.08 # not used for TTS, only for speech editing\n",`
			`"right_margin = 0.08 # not used for TTS, only for speech editing\n",`
init 2024-03-21 19:02:20 +01:00			`"codec_audio_sr = 16000\n",`
			`"codec_sr = 50\n",`
			`"top_k = 0\n",`
			`"top_p = 0.8\n",`
			`"temperature = 1\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"kvcache = 1\n",`
init 2024-03-21 19:02:20 +01:00			`"silence_tokens=[1388,1898,131]\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"# adjust the below three arguments if the generation is not as good\n",`
			`"seed = 1 # random seed magic\n",`
			`"stop_repetition = 3 # if there are long silence in the generated audio, reduce the stop_repetition to 3, 2 or even 1\n",`
			`"sample_batch_size = 4 # if there are long silence or unnaturally strecthed words, increase sample_batch_size to 2, 3 or even 4\n",`
init 2024-03-21 19:02:20 +01:00			`"# what this will do to the model is that the model will run sample_batch_size examples of the same audio, and pick the one that's the shortest\n",`
			`"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",`
			`"\n",`
			`"# point to the original file or record the file\n",`
			`"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",`
init 2024-03-21 19:02:20 +01:00			`"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",`
			`"\n",`
			`"# move the audio and transcript to temp folder\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"temp_folder = \"./demo/temp\"\n",`
init 2024-03-21 19:02:20 +01:00			`"os.makedirs(temp_folder, exist_ok=True)\n",`
			`"os.system(f\"cp {orig_audio} {temp_folder}\")\n",`
			`"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",`
			`"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",`
			`" f.write(orig_transcript)\n",`
			`"# run MFA to get the alignment\n",`
			`"align_temp = f\"{temp_folder}/mfa_alignments\"\n",`
			`"os.makedirs(align_temp, exist_ok=True)\n",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"\n",`
			`"# get into the conda environment and download the needed MFA models\n",`
			`"!source ~/.bashrc && \\\n",`
			`" conda activate voicecraft && \\\n",`
			`" mfa model download dictionary english_us_arpa && \\\n",`
			`" mfa model download acoustic english_us_arpa\n",`
			`"\n",`
			`"os.system(f\". ~/.bashrc && conda activate voicecraft && mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",`
			`"\n",`
init 2024-03-21 19:02:20 +01:00			`"# if the above fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",`
			`"# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",`
			`"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",`
			`"transcript_fn = f\"{temp_folder}/{filename}.txt\"\n",`
			`"align_fn = f\"{align_temp}/{filename}.csv\""`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"execution_count": null,`
init 2024-03-21 19:02:20 +01:00			`"metadata": {},`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"outputs": [],`
init 2024-03-21 19:02:20 +01:00			`"source": [`
			`"# take a look at demo/temp/mfa_alignment, decide which part of the audio to use as prompt\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"cut_off_sec = 3.01 # NOTE: according to forced-alignment file, the word \"common\" stop as 3.01 sec, this should be different for different audio\n",`
init 2024-03-21 19:02:20 +01:00			`"target_transcript = \"But when I had approached so near to them The common I cannot believe that the same model can also do text to speech synthesis as well!\"\n",`
			`"info = torchaudio.info(audio_fn)\n",`
			`"audio_dur = info.num_frames / info.sample_rate\n",`
			`"\n",`
			`"assert cut_off_sec < audio_dur, f\"cut_off_sec {cut_off_sec} is larger than the audio duration {audio_dur}\"\n",`
			`"prompt_end_frame = int(cut_off_sec * info.sample_rate)\n",`
			`"\n",`
			`"\n",`
			`"# # load model, tokenizer, and other necessary files\n",`
			`"from models import voicecraft\n",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"#import models.voicecraft as voicecraft\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"voicecraft_name=\"giga830M.pth\"\n",`
			`"ckpt_fn =f\"./pretrained_models/{voicecraft_name}\"\n",`
			`"encodec_fn = \"./pretrained_models/encodec_4cb2048_giga.th\"\n",`
			`"if not os.path.exists(ckpt_fn):\n",`
			`" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/{voicecraft_name}\\?download\\=true\")\n",`
			`" os.system(f\"mv {voicecraft_name}\\?download\\=true ./pretrained_models/{voicecraft_name}\")\n",`
			`"if not os.path.exists(encodec_fn):\n",`
			`" os.system(f\"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th\")\n",`
			`" os.system(f\"mv encodec_4cb2048_giga.th ./pretrained_models/encodec_4cb2048_giga.th\")\n",`
			`"\n",`
init 2024-03-21 19:02:20 +01:00			`"ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",`
			`"model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",`
			`"model.load_state_dict(ckpt[\"model\"])\n",`
			`"model.to(device)\n",`
			`"model.eval()\n",`
			`"\n",`
			`"phn2num = ckpt['phn2num']\n",`
			`"\n",`
			`"text_tokenizer = TextTokenizer(backend=\"espeak\")\n",`
			`"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",`
			`"\n",`
			`"# run the model to get the output\n",`
			`"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens, \"sample_batch_size\": sample_batch_size}\n",`
			`"from inference_tts_scale import inference_one_sample\n",`
			`"concated_audio, gen_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, device, decode_config, prompt_end_frame)\n",`
			`" \n",`
			`"# save segments for comparison\n",`
			`"concated_audio, gen_audio = concated_audio[0].cpu(), gen_audio[0].cpu()\n",`
			`"# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",`
			`"\n",`
			`"\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"# display the audio\n",`
			`"from IPython.display import Audio\n",`
			`"print(\"concatenate prompt and generated:\")\n",`
			`"display(Audio(concated_audio, rate=codec_audio_sr))\n",`
			`"\n",`
			`"print(\"generated:\")\n",`
			`"display(Audio(gen_audio, rate=codec_audio_sr))\n",`
			`"\n",`
			`"# # save the audio\n",`
			`"# # output_dir\n",`
			`"# output_dir = \"/home/pyp/VoiceCraft/demo/generated_tts\"\n",`
			`"# os.makedirs(output_dir, exist_ok=True)\n",`
			`"# seg_save_fn_gen = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_gen_seed{seed}.wav\"\n",`
			`"# seg_save_fn_concat = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_concat_seed{seed}.wav\" \n",`
init 2024-03-21 19:02:20 +01:00			`"\n",`
weights, notebook working 2024-03-29 00:21:30 +01:00			`"# torchaudio.save(seg_save_fn_gen, gen_audio, codec_audio_sr)\n",`
			`"# torchaudio.save(seg_save_fn_concat, concated_audio, codec_audio_sr)\n",`
init 2024-03-21 19:02:20 +01:00			`"\n",`
			`"# if you get error importing T5 in transformers\n",`
			`"# try \n",`
			`"# pip uninstall Pillow\n",`
			`"# pip install Pillow\n",`
			`"# you are might get warnings like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "voicecraft",`
			`"language": "python",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"name": "voicecraft"`
init 2024-03-21 19:02:20 +01:00			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"version": "3.9.19"`
init 2024-03-21 19:02:20 +01:00			`}`
			`},`
			`"nbformat": 4,`
Automate instalation of dependencies in notebook. Add a note in README about running in docker too to reduce cruft on your host box. Good luck, be nice! 2024-03-29 20:27:12 +01:00			`"nbformat_minor": 4`
init 2024-03-21 19:02:20 +01:00			`}`