diff --git a/Dockerfile b/Dockerfile
index 561b4ec..3fbe052 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,8 @@ RUN apt-get update && apt-get install -y git-core ffmpeg espeak-ng && \
 RUN conda update -y -n base -c conda-forge conda && \
     conda create -y -n voicecraft python=3.9.16 && \
     conda run -n voicecraft conda install -y -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068 && \
+    conda run -n voicecraft mfa model download dictionary english_us_arpa && \
+    conda run -n voicecraft mfa model download acoustic english_us_arpa && \
     conda run -n voicecraft pip install -e git+https://github.com/facebookresearch/audiocraft.git@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft && \
     conda run -n voicecraft pip install xformers==0.0.22 && \
     conda run -n voicecraft pip install torch==2.0.1 && \
diff --git a/README.md b/README.md
index 8b7f984..e292536 100644
--- a/README.md
+++ b/README.md
@@ -30,9 +30,11 @@ If you want to do model development such as training/finetuning, I recommend fol
 - [x] Training guidance
 - [x] RealEdit dataset and training manifest
 - [x] Model weights (giga330M.pth, giga830M.pth, and gigaHalfLibri330M_TTSEnhanced_max16s.pth)
+- [x] Better guidance on training/finetuning
 - [x] Write colab notebooks for better hands-on experience
 - [ ] HuggingFace Spaces demo
-- [ ] Better guidance on training/finetuning
+- [ ] Command line
+- [ ] Improve efficiency
 
 ## QuickStart Colab
@@ -95,6 +97,9 @@ pip install datasets==2.16.0
 pip install torchmetrics==0.11.1
 # install MFA for getting forced-alignment, this could take a few minutes
 conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068
+# download the MFA English dictionary and acoustic model
+mfa model download dictionary english_us_arpa
+mfa model download acoustic english_us_arpa
 # conda install pocl # above gives a warning about installing pocl, not sure if it's really needed
 # to run ipynb
@@ -145,19 +150,16 @@ cd ./z_scripts
 bash e830M.sh
 ```
 
+The same procedure applies to preparing your own custom dataset.
+
+## Finetuning
+You also need to do steps 1-4 as in Training, and I recommend using AdamW for optimization, which gives better stability when finetuning a pretrained model. Check out the script `/home/pyp/VoiceCraft/z_scripts/e830M_ft.sh`.
+
+If your dataset introduces new phonemes (which is very likely) that don't exist in the giga checkpoint, make sure you combine the original phonemes with the phonemes from your data when constructing the vocab. You also need to adjust `--text_vocab_size` and `--text_pad_token` so that the former is bigger than or equal to your vocab size, and the latter has the same value as `--text_vocab_size` (i.e. `--text_pad_token` is always the last token). Also, since the text embeddings are now of a different size, make sure you modify the weight-loading part so that it won't crash (you could skip loading `text_embedding`, or load only the existing part and randomly initialize the new entries); see the sketches below.
 
 ## License
 The codebase is under CC BY-NC-SA 4.0 ([LICENSE-CODE](./LICENSE-CODE)), and the model weights are under the Coqui Public Model License 1.0.0 ([LICENSE-MODEL](./LICENSE-MODEL)). Note that we use some of the code from other repositories that are under different licenses: `./models/codebooks_patterns.py` is under the MIT license; `./models/modules`, `./steps/optim.py`, and `data/tokenizer.py` are under the Apache License, Version 2.0; the phonemizer we used is under the GNU 3.0 License.
-
-
 ## Acknowledgement
 We thank Feiteng for his [VALL-E reproduction](https://github.com/lifeiteng/vall-e), and we thank the audiocraft team for open-sourcing [encodec](https://github.com/facebookresearch/audiocraft).
diff --git a/z_scripts/e830M_ft.sh b/z_scripts/e830M_ft.sh
new file mode 100644
index 0000000..9226e5f
--- /dev/null
+++ b/z_scripts/e830M_ft.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+source ~/miniconda3/etc/profile.d/conda.sh
+conda activate voicecraft
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+export WORLD_SIZE=4
+
+dataset=gigaspeech
+mkdir -p ./logs/${dataset}
+
+exp_root="path/to/store/exp_results"
+exp_name=e830M_ft
+dataset_dir="path/to/stored_extracted_codes_and_phonemes/xl" # xs if you only extracted xs in the previous step
+encodec_codes_folder_name="encodec_16khz_4codebooks"
+load_model_from="/home/pyp/VoiceCraft/pretrained_models/giga830M.pth"
+
+# export CUDA_LAUNCH_BLOCKING=1 # for debugging
+
+torchrun --nnodes=1 --rdzv-backend=c10d --rdzv-endpoint=localhost:41977 --nproc_per_node=${WORLD_SIZE} \
+../main.py \
+--load_model_from ${load_model_from} \
+--reduced_eog 1 \
+--drop_long 1 \
+--eos 2051 \
+--n_special 4 \
+--pad_x 0 \
+--codebook_weight "[3,1,1,1]" \
+--encodec_sr 50 \
+--num_steps 500000 \
+--lr 0.00001 \
+--warmup_fraction 0.1 \
+--optimizer_name "AdamW" \
+--d_model 2048 \
+--audio_embedding_dim 2048 \
+--nhead 16 \
+--num_decoder_layers 16 \
+--max_num_tokens 20000 \
+--gradient_accumulation_steps 20 \
+--val_max_num_tokens 6000 \
+--num_buckets 6 \
+--audio_max_length 20 \
+--audio_min_length 2 \
+--text_max_length 400 \
+--text_min_length 10 \
+--mask_len_min 1 \
+--mask_len_max 600 \
+--tb_write_every_n_steps 10 \
+--print_every_n_steps 400 \
+--val_every_n_steps 1600 \
+--text_vocab_size 100 \
+--text_pad_token 100 \
+--phn_folder_name "phonemes" \
+--manifest_name "manifest" \
+--encodec_folder_name ${encodec_codes_folder_name} \
+--audio_vocab_size 2048 \
+--empty_token 2048 \
+--eog 2049 \
+--audio_pad_token 2050 \
+--n_codebooks 4 \
+--max_n_spans 3 \
+--shuffle_mask_embedding 0 \
+--mask_sample_dist poisson1 \
+--max_mask_portion 0.9 \
+--min_gap 5 \
+--num_workers 8 \
+--dynamic_batching 1 \
+--dataset $dataset \
+--exp_dir "${exp_root}/${dataset}/${exp_name}" \
+--dataset_dir ${dataset_dir}
+# >> ./logs/${dataset}/${exp_name}.log 2>&1
\ No newline at end of file
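
The Finetuning section above says to combine the original giga phonemes with the phonemes from your data when constructing the vocab, and to set `--text_vocab_size`/`--text_pad_token` accordingly (the script above uses 100 for both). Below is a minimal Python sketch of that bookkeeping. It assumes a one-phoneme-per-line vocab file; the file names are hypothetical, and the repo's actual phoneme-to-ID storage may differ.

```python
# A sketch only: merge the original giga phoneme vocab with new phonemes.
# File names ("giga_vocab.txt", "my_vocab.txt") are hypothetical; check how
# the repo actually stores its phoneme-to-ID mapping before relying on this.

def build_combined_vocab(giga_vocab_path: str, new_vocab_path: str, out_path: str) -> int:
    """Append unseen phonemes after the original ones so that existing
    phoneme IDs keep their positions; write the merged vocab and return its size."""
    with open(giga_vocab_path) as f:
        phonemes = [line.strip() for line in f if line.strip()]
    seen = set(phonemes)
    with open(new_vocab_path) as f:
        for line in f:
            phn = line.strip()
            if phn and phn not in seen:
                seen.add(phn)
                phonemes.append(phn)
    with open(out_path, "w") as f:
        f.write("\n".join(phonemes) + "\n")
    return len(phonemes)

vocab_size = build_combined_vocab("giga_vocab.txt", "my_vocab.txt", "combined_vocab.txt")
# --text_vocab_size must be >= the combined vocab size, and --text_pad_token
# takes the same value as --text_vocab_size (the pad token is the last token):
print(f"--text_vocab_size {vocab_size} --text_pad_token {vocab_size}")
```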
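Because a bigger text vocab changes the shape of the text embedding, loading `giga830M.pth` with strict shape checking would crash. Here is one way to do the partial loading the Finetuning section describes, as a sketch under stated assumptions: the `"model"` checkpoint key and the `text_embedding` parameter name are guesses for illustration, so inspect your actual `.pth` file for the real names.

```python
import torch
from torch import nn

def load_pretrained_with_new_vocab(model: nn.Module, ckpt_path: str) -> None:
    """Load a pretrained checkpoint into a model whose text vocab has grown.

    Tensors with matching shapes are copied as-is; for the text embedding
    (first dim = vocab size) the overlapping rows are copied and the rows
    for new phonemes keep their random initialization.
    """
    ckpt = torch.load(ckpt_path, map_location="cpu")
    pretrained = ckpt.get("model", ckpt)  # the "model" key is an assumption
    target = model.state_dict()
    to_load = {}
    for name, param in pretrained.items():
        if name not in target:
            continue
        if param.shape == target[name].shape:
            to_load[name] = param
        elif "text_embedding" in name:  # assumed parameter name
            merged = target[name].clone()
            n_rows = min(param.shape[0], merged.shape[0])
            merged[:n_rows] = param[:n_rows]
            to_load[name] = merged
    model.load_state_dict(to_load, strict=False)  # skip anything unhandled
```

Under these assumptions, only the rows for newly added phonemes start from random initialization; everything else transfers from the giga checkpoint, and `--load_model_from` can keep pointing at the original file once the loading code is adapted along these lines.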