VoiceCraft/inference_speech_editing.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# order CUDA devices by PCI bus id and make only device 0 visible to this process\n",
    "os.environ.update({\"CUDA_DEVICE_ORDER\": \"PCI_BUS_ID\", \"CUDA_VISIBLE_DEVICES\": \"0\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/pyp/miniconda3/envs/voicecraft/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# import libs\n",
    "import torch\n",
    "import torchaudio\n",
    "\n",
    "from data.tokenizer import (\n",
    "    AudioTokenizer,\n",
    "    TextTokenizer,\n",
    ")\n",
    "\n",
    "from models import voicecraft\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "import subprocess\n",
    "\n",
    "# hyperparameters for inference\n",
    "left_margin = 0.08 # seconds subtracted from the start of the edit span when building the mask\n",
    "right_margin = 0.08 # seconds added to the end of the edit span when building the mask\n",
    "seed = 1\n",
    "codec_audio_sr = 16000 # sample rate used when saving the audio\n",
    "codec_sr = 50 # codec frames per second, used to convert seconds to codec frames\n",
    "top_k = 0\n",
    "top_p = 0.8\n",
    "temperature = 1\n",
    "kvcache = 0\n",
    "silence_tokens = [1388,1898,131]\n",
    "stop_repetition = -1 # do not stop repetition on silence\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "# point to the original file or record the file\n",
    "# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",
    "orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",
    "orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",
    "\n",
    "# copy the audio into the temp folder and write the transcript next to it\n",
    "temp_folder = \"./demo/temp\"\n",
    "os.makedirs(temp_folder, exist_ok=True)\n",
    "# shutil.copy raises if the source file is missing, instead of failing silently like a shell `cp`\n",
    "shutil.copy(orig_audio, temp_folder)\n",
    "filename = os.path.splitext(os.path.basename(orig_audio))[0]\n",
    "audio_fn = f\"{temp_folder}/{filename}.wav\"\n",
    "transcript_fn = f\"{temp_folder}/{filename}.txt\"\n",
    "with open(transcript_fn, \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(orig_transcript)\n",
    "\n",
    "# run MFA to get the alignment\n",
    "align_temp = f\"{temp_folder}/mfa_alignments\"\n",
    "os.makedirs(align_temp, exist_ok=True)\n",
    "mfa_cmd = [\"mfa\", \"align\", \"-j\", \"1\", \"--output_format\", \"csv\", temp_folder, \"english_us_arpa\", \"english_us_arpa\", align_temp]\n",
    "# if it fails, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",
    "# mfa_cmd += [\"--beam\", \"1000\", \"--retry_beam\", \"2000\"]\n",
    "# check=True raises CalledProcessError when MFA exits with a non-zero status, so a failed alignment is reported here\n",
    "subprocess.run(mfa_cmd, check=True)\n",
    "align_fn = f\"{align_temp}/{filename}.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1)\n"
     ]
    }
   ],
   "source": [
    "editTypes_set = {'substitution', 'insertion', 'deletion'}\n",
    "# propose what you want the target modified transcript to be\n",
    "target_transcript = \"But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,\"\n",
    "edit_type = \"substitution\"\n",
    "assert edit_type in editTypes_set, f\"Invalid edit type {edit_type}. Must be one of {editTypes_set}.\"\n",
    "\n",
    "# if you want to do a second modification on top of the first one, write down the second modification (target_transcript2, type_of_modification2)\n",
    "# make sure the two modifications do not overlap, if they do, you need to combine them into one modification\n",
    "\n",
    "# run the script to turn user input to the format that the model can take\n",
    "from edit_utils import get_span\n",
    "orig_span, new_span = get_span(orig_transcript, target_transcript, edit_type)\n",
    "if orig_span[0] > orig_span[1]:\n",
    "    # an inverted span cannot be masked; stop here instead of continuing with bad indices\n",
    "    raise RuntimeError(f\"example {audio_fn} failed\")\n",
    "# a span whose two endpoints are equal is collapsed to a single index\n",
    "orig_span_save = [orig_span[0]] if orig_span[0] == orig_span[1] else orig_span\n",
    "new_span_save = [new_span[0]] if new_span[0] == new_span[1] else new_span\n",
    "\n",
    "orig_span_save = \",\".join(str(item) for item in orig_span_save)\n",
    "new_span_save = \",\".join(str(item) for item in new_span_save)\n",
    "from inference_speech_editing_scale import get_mask_interval\n",
    "\n",
    "if not os.path.isfile(align_fn):\n",
    "    raise FileNotFoundError(f\"MFA alignment file {align_fn} not found, re-run the alignment step (a larger --beam/--retry_beam may help)\")\n",
    "start, end = get_mask_interval(align_fn, orig_span_save, edit_type)\n",
    "info = torchaudio.info(audio_fn)\n",
    "audio_dur = info.num_frames / info.sample_rate\n",
    "# widen the span by the margins, clamped to [one codec frame, audio duration]\n",
    "morphed_span = (max(start - left_margin, 1/codec_sr), min(end + right_margin, audio_dur)) # in seconds\n",
    "\n",
    "# span in codec frames\n",
    "mask_interval = [[round(morphed_span[0]*codec_sr), round(morphed_span[1]*codec_sr)]]\n",
    "mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n",
    "\n",
    "# load model, tokenizer, and other necessary files\n",
    "ckpt_fn = \"/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/pretrained_830M/best_bundle.pth\"\n",
    "encodec_fn = \"/data/scratch/pyp/exp_pyp/audiocraft/encodec/xps/6f79c6a8/checkpoint.th\"\n",
    "# NOTE: torch.load unpickles the file, only load checkpoints from a source you trust\n",
    "ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",
    "model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",
    "model.load_state_dict(ckpt[\"model\"])\n",
    "model.to(device)\n",
    "model.eval()\n",
    "\n",
    "phn2num = ckpt['phn2num']\n",
    "\n",
    "text_tokenizer = TextTokenizer(backend=\"espeak\")\n",
    "audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",
    "\n",
    "# run the model to get the output\n",
    "from inference_speech_editing_scale import inference_one_sample\n",
    "\n",
    "decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens}\n",
    "# apply the seed before sampling so the result matches the seed recorded in the output file name\n",
    "torch.manual_seed(seed)\n",
    "orig_audio, new_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)\n",
    "\n",
    "# save segments for comparison\n",
    "orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()\n",
    "# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",
    "\n",
    "# output_dir\n",
    "output_dir = \"./demo/generated_se\"\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "audio_stem = os.path.splitext(os.path.basename(audio_fn))[0]\n",
    "save_fn_new = f\"{output_dir}/{audio_stem}_new_seed{seed}.wav\"\n",
    "\n",
    "torchaudio.save(save_fn_new, new_audio, codec_audio_sr)\n",
    "\n",
    "# the original is written only once; it is reloaded from disk and resampled if its rate differs\n",
    "save_fn_orig = f\"{output_dir}/{audio_stem}_orig.wav\"\n",
    "if not os.path.isfile(save_fn_orig):\n",
    "    orig_audio, orig_sr = torchaudio.load(audio_fn)\n",
    "    if orig_sr != codec_audio_sr:\n",
    "        orig_audio = torchaudio.transforms.Resample(orig_sr, codec_audio_sr)(orig_audio)\n",
    "    torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)\n",
    "\n",
    "# if you get an error importing T5 in transformers\n",
    "# try \n",
    "# pip uninstall Pillow\n",
    "# pip install Pillow\n",
    "# you are likely to get a warning that looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "voicecraft",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
init 2024-03-21 19:02:20 +01:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import os\n",`
			`"os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" \n",`
			`"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\""`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"/home/pyp/miniconda3/envs/voicecraft/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",`
			`" from .autonotebook import tqdm as notebook_tqdm\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"# import libs\n",`
			`"import torch\n",`
			`"import torchaudio\n",`
			`"\n",`
			`"from data.tokenizer import (\n",`
			`" AudioTokenizer,\n",`
			`" TextTokenizer,\n",`
			`")\n",`
			`"\n",`
			`"from models import voicecraft\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# hyperparameters for inference\n",`
			`"left_margin = 0.08\n",`
			`"right_margin = 0.08\n",`
			`"seed = 1\n",`
			`"codec_audio_sr = 16000\n",`
			`"codec_sr = 50\n",`
			`"top_k = 0\n",`
			`"top_p = 0.8\n",`
			`"temperature = 1\n",`
			`"kvcache = 0\n",`
			`"silence_tokens = [1388,1898,131]\n",`
			`"stop_repetition = -1 # do not stop repetition on silence\n",`
			`"device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",`
			`"\n",`
			`"# point to the original file or record the file\n",`
			`"# write down the transcript for the file, or run whisper to get the transcript (and you can modify it if it's not accurate), save it as a .txt file\n",`
			`"orig_audio = \"./demo/84_121550_000074_000000.wav\"\n",`
			`"orig_transcript = \"But when I had approached so near to them The common object, which the sense deceives, Lost not by distance any of its marks,\"\n",`
			`"# move the audio and transcript to temp folder\n",`
			`"temp_folder = \"./demo/temp\"\n",`
			`"os.makedirs(temp_folder, exist_ok=True)\n",`
			`"os.system(f\"cp {orig_audio} {temp_folder}\")\n",`
			`"filename = os.path.splitext(orig_audio.split(\"/\")[-1])[0]\n",`
			`"with open(f\"{temp_folder}/{filename}.txt\", \"w\") as f:\n",`
			`" f.write(orig_transcript)\n",`
			`"# run MFA to get the alignment\n",`
			`"align_temp = f\"{temp_folder}/mfa_alignments\"\n",`
			`"os.makedirs(align_temp, exist_ok=True)\n",`
			`"os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp}\")\n",`
			`"# if it fail, it could be because the audio is too hard for the alignment model, increasing the beam size usually solves the issue\n",`
			`"# os.system(f\"mfa align -j 1 --output_format csv {temp_folder} english_us_arpa english_us_arpa {align_temp} --beam 1000 --retry_beam 2000\")\n",`
			`"audio_fn = f\"{temp_folder}/{filename}.wav\"\n",`
			`"transcript_fn = f\"{temp_folder}/{filename}.txt\"\n",`
			`"align_fn = f\"{align_temp}/{filename}.csv\"\n",`
			`"\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 7,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
			`"WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1)\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"editTypes_set = set(['substitution', 'insertion', 'deletion'])\n",`
			`"# propose what do you want the target modified transcript to be\n",`
			`"target_transcript = \"But when I saw the mirage of the lake in the distance, which the sense deceives, Lost not by distance any of its marks,\"\n",`
			`"edit_type = \"substitution\"\n",`
			`"assert edit_type in editTypes_set, f\"Invalid edit type {edit_type}. Must be one of {editTypes_set}.\"\n",`
			`"\n",`
			`"# if you want to do a second modification on top of the first one, write down the second modification (target_transcript2, type_of_modification2)\n",`
			`"# make sure the two modification do not overlap, if they do, you need to combine them into one modification\n",`
			`"\n",`
			`"# run the script to turn user input to the format that the model can take\n",`
			`"from edit_utils import get_span\n",`
			`"orig_span, new_span = get_span(orig_transcript, target_transcript, edit_type)\n",`
			`"if orig_span[0] > orig_span[1]:\n",`
			`" RuntimeError(f\"example {audio_fn} failed\")\n",`
			`"if orig_span[0] == orig_span[1]:\n",`
			`" orig_span_save = [orig_span[0]]\n",`
			`"else:\n",`
			`" orig_span_save = orig_span\n",`
			`"if new_span[0] == new_span[1]:\n",`
			`" new_span_save = [new_span[0]]\n",`
			`"else:\n",`
			`" new_span_save = new_span\n",`
			`"\n",`
			`"orig_span_save = \",\".join([str(item) for item in orig_span_save])\n",`
			`"new_span_save = \",\".join([str(item) for item in new_span_save])\n",`
			`"from inference_speech_editing_scale import get_mask_interval\n",`
			`"\n",`
			`"start, end = get_mask_interval(align_fn, orig_span_save, edit_type)\n",`
			`"info = torchaudio.info(audio_fn)\n",`
			`"audio_dur = info.num_frames / info.sample_rate\n",`
			`"morphed_span = (max(start - left_margin, 1/codec_sr), min(end + right_margin, audio_dur)) # in seconds\n",`
			`"\n",`
			`"# span in codec frames\n",`
			`"mask_interval = [[round(morphed_span[0]*codec_sr), round(morphed_span[1]*codec_sr)]]\n",`
			`"mask_interval = torch.LongTensor(mask_interval) # [M,2], M==1 for now\n",`
			`"\n",`
			`"# load model, tokenizer, and other necessary files\n",`
			`"ckpt_fn = \"/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/pretrained_830M/best_bundle.pth\"\n",`
			`"encodec_fn = \"/data/scratch/pyp/exp_pyp/audiocraft/encodec/xps/6f79c6a8/checkpoint.th\"\n",`
			`"ckpt = torch.load(ckpt_fn, map_location=\"cpu\")\n",`
			`"model = voicecraft.VoiceCraft(ckpt[\"config\"])\n",`
			`"model.load_state_dict(ckpt[\"model\"])\n",`
			`"model.to(device)\n",`
			`"model.eval()\n",`
			`"\n",`
			`"phn2num = ckpt['phn2num']\n",`
			`"\n",`
			`"text_tokenizer = TextTokenizer(backend=\"espeak\")\n",`
			`"audio_tokenizer = AudioTokenizer(signature=encodec_fn) # will also put the neural codec model on gpu\n",`
			`"\n",`
			`"# run the model to get the output\n",`
			`"from inference_speech_editing_scale import inference_one_sample\n",`
			`"\n",`
			`"decode_config = {'top_k': top_k, 'top_p': top_p, 'temperature': temperature, 'stop_repetition': stop_repetition, 'kvcache': kvcache, \"codec_audio_sr\": codec_audio_sr, \"codec_sr\": codec_sr, \"silence_tokens\": silence_tokens}\n",`
			`"orig_audio, new_audio = inference_one_sample(model, ckpt[\"config\"], phn2num, text_tokenizer, audio_tokenizer, audio_fn, target_transcript, mask_interval, device, decode_config)\n",`
			`" \n",`
			`"# save segments for comparison\n",`
			`"orig_audio, new_audio = orig_audio[0].cpu(), new_audio[0].cpu()\n",`
			`"# logging.info(f\"length of the resynthesize orig audio: {orig_audio.shape}\")\n",`
			`"\n",`
			`"# output_dir\n",`
			`"output_dir = \"./demo/generated_se\"\n",`
			`"os.makedirs(output_dir, exist_ok=True)\n",`
			`"\n",`
			`"save_fn_new = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_new_seed{seed}.wav\"\n",`
			`"\n",`
			`"torchaudio.save(save_fn_new, new_audio, codec_audio_sr)\n",`
			`"\n",`
			`"save_fn_orig = f\"{output_dir}/{os.path.basename(audio_fn)[:-4]}_orig.wav\"\n",`
			`"if not os.path.isfile(save_fn_orig):\n",`
			`" orig_audio, orig_sr = torchaudio.load(audio_fn)\n",`
			`" if orig_sr != codec_audio_sr:\n",`
			`" orig_audio = torchaudio.transforms.Resample(orig_sr, codec_audio_sr)(orig_audio)\n",`
			`" torchaudio.save(save_fn_orig, orig_audio, codec_audio_sr)\n",`
			`"\n",`
			`"# if you get error importing T5 in transformers\n",`
			`"# try \n",`
			`"# pip uninstall Pillow\n",`
			`"# pip install Pillow\n",`
			`"# you are likely to get warning looks like WARNING:phonemizer:words count mismatch on 300.0% of the lines (3/1), this can be safely ignored"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "voicecraft",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.9.18"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`