mrfakename committed on
Commit b5979c9
1 Parent(s): b164af3

Sync from GitHub repo


This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo.

pyproject.toml CHANGED
@@ -39,6 +39,7 @@ dependencies = [
     "vocos",
     "wandb",
     "x_transformers>=1.31.14",
+    "hydra-core>=1.3.0",
 ]
 
 [project.optional-dependencies]
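Since `hydra-core` is now a core dependency, an editable install from the repo root should pick it up automatically. A minimal sketch of standard pip usage, not part of this commit:

```bash
# from the repository root: reinstall in editable mode so hydra-core is pulled in
pip install -e .
```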
src/f5_tts/configs/E2TTS_Base_train.yaml ADDED
@@ -0,0 +1,43 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN  # dataset name
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16  # number of workers
+
+optim:
+  epochs: 15  # max epochs
+  learning_rate: 7.5e-5  # learning rate
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb optimizer or not
+
+model:
+  name: E2TTS_Base  # model name
+  tokenizer: pinyin  # tokenizer type
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 1024  # model dimension
+    depth: 24  # number of transformer layers
+    heads: 16  # number of transformer heads
+    ff_mult: 4  # ff layer expansion
+  mel_spec:
+    target_sample_rate: 24000  # target sample rate
+    n_mel_channels: 100  # mel channel
+    hop_length: 256  # hop length
+    win_length: 1024  # window length
+    n_fft: 1024  # fft length
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False  # use local vocoder or not
+    local_vocoder_path: None  # path to local vocoder
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
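With this config in place, training can be pointed at it by name. A minimal sketch based on the launch command added to src/f5_tts/train/README.md in this same commit (the exact accelerate setup depends on your environment):

```bash
# configure accelerate once (multi-GPU DDP, fp16, etc.), then launch with the E2TTS base config
accelerate config
accelerate launch src/f5_tts/train/train.py --config-name E2TTS_Base_train.yaml
```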
src/f5_tts/configs/E2TTS_Small_train.yaml ADDED
@@ -0,0 +1,43 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16  # number of workers
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+  bnb_optimizer: False
+
+model:
+  name: E2TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 20
+    heads: 12
+    ff_mult: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
src/f5_tts/configs/F5TTS_Base_train.yaml ADDED
@@ -0,0 +1,45 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN  # dataset name
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16  # number of workers
+
+optim:
+  epochs: 15  # max epochs
+  learning_rate: 7.5e-5  # learning rate
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb optimizer or not
+
+model:
+  name: F5TTS_Base  # model name
+  tokenizer: pinyin  # tokenizer type
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 1024  # model dim
+    depth: 22  # model depth
+    heads: 16  # model heads
+    ff_mult: 2  # feedforward expansion
+    text_dim: 512  # text encoder dim
+    conv_layers: 4  # convolution layers
+  mel_spec:
+    target_sample_rate: 24000  # target sample rate
+    n_mel_channels: 100  # mel channel
+    hop_length: 256  # hop length
+    win_length: 1024  # window length
+    n_fft: 1024  # fft length
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False  # use local vocoder or not
+    local_vocoder_path: None  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
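Because these YAMLs are loaded through Hydra, individual fields can also be overridden from the command line without editing the file. A minimal sketch assuming standard Hydra `key=value` override syntax; the values shown are illustrative only:

```bash
# train F5TTS_Base but shrink the per-GPU batch and warmup for a smaller setup (example values)
accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml \
    datasets.batch_size_per_gpu=19200 optim.num_warmup_updates=10000
```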
src/f5_tts/configs/F5TTS_Small_train.yaml ADDED
@@ -0,0 +1,45 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16  # number of workers
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+  bnb_optimizer: False
+
+model:
+  name: F5TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 18
+    heads: 12
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
src/f5_tts/eval/README.md CHANGED
@@ -42,8 +42,8 @@ Then update in the following scripts with the paths you put evaluation model ckp
 Update the path with your batch-inferenced results, and carry out WER / SIM evaluations:
 ```bash
 # Evaluation for Seed-TTS test set
-python src/f5_tts/eval/eval_seedtts_testset.py
+python src/f5_tts/eval/eval_seedtts_testset.py --gen_wav_dir <GEN_WAVE_DIR>
 
 # Evaluation for LibriSpeech-PC test-clean (cross-sentence)
-python src/f5_tts/eval/eval_librispeech_test_clean.py
+python src/f5_tts/eval/eval_librispeech_test_clean.py --gen_wav_dir <GEN_WAVE_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>
 ```
src/f5_tts/eval/eval_infer_batch.py CHANGED
@@ -34,8 +34,6 @@ win_length = 1024
 n_fft = 1024
 target_rms = 0.1
 
-
-tokenizer = "pinyin"
 rel_path = str(files("f5_tts").joinpath("../../"))
 
 
@@ -49,6 +47,7 @@ def main():
     parser.add_argument("-n", "--expname", required=True)
     parser.add_argument("-c", "--ckptstep", default=1200000, type=int)
     parser.add_argument("-m", "--mel_spec_type", default="vocos", type=str, choices=["bigvgan", "vocos"])
+    parser.add_argument("-to", "--tokenizer", default="pinyin", type=str, choices=["pinyin", "char"])
 
     parser.add_argument("-nfe", "--nfestep", default=32, type=int)
     parser.add_argument("-o", "--odemethod", default="euler")
@@ -64,6 +63,7 @@ def main():
     ckpt_step = args.ckptstep
     ckpt_path = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}.pt"
     mel_spec_type = args.mel_spec_type
+    tokenizer = args.tokenizer
 
     nfe_step = args.nfestep
     ode_method = args.odemethod
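The new `-to/--tokenizer` flag makes the tokenizer selectable at evaluation time instead of being hard-coded in the script. A hypothetical invocation based only on the arguments visible in this diff (other flags keep their defaults; the experiment name is an example):

```bash
# batch inference with an explicit tokenizer choice; -n names the experiment whose checkpoint is loaded
python src/f5_tts/eval/eval_infer_batch.py -n F5TTS_Base -c 1200000 -m vocos -to pinyin
```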
src/f5_tts/eval/eval_librispeech_test_clean.py CHANGED
@@ -2,6 +2,7 @@
 
 import sys
 import os
+import argparse
 
 sys.path.append(os.getcwd())
 
@@ -19,55 +20,65 @@ from f5_tts.eval.utils_eval import (
 rel_path = str(files("f5_tts").joinpath("../../"))
 
 
-eval_task = "wer"  # sim | wer
-lang = "en"
-metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
-librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
-gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs
-
-gpus = [0, 1, 2, 3, 4, 5, 6, 7]
-test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)
-
-## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
-## leading to a low similarity for the ground truth in some cases.
-# test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True)  # eval ground truth
-
-local = False
-if local:  # use local custom checkpoint dir
-    asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
-else:
-    asr_ckpt_dir = ""  # auto download to cache dir
-
-wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
-
-
-# --------------------------- WER ---------------------------
-
-if eval_task == "wer":
-    wers = []
-
-    with mp.Pool(processes=len(gpus)) as pool:
-        args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
-        results = pool.map(run_asr_wer, args)
-        for wers_ in results:
-            wers.extend(wers_)
-
-    wer = round(np.mean(wers) * 100, 3)
-    print(f"\nTotal {len(wers)} samples")
-    print(f"WER : {wer}%")
-
-
-# --------------------------- SIM ---------------------------
-
-if eval_task == "sim":
-    sim_list = []
-
-    with mp.Pool(processes=len(gpus)) as pool:
-        args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
-        results = pool.map(run_sim, args)
-        for sim_ in results:
-            sim_list.extend(sim_)
-
-    sim = round(sum(sim_list) / len(sim_list), 3)
-    print(f"\nTotal {len(sim_list)} samples")
-    print(f"SIM : {sim}")
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
+    parser.add_argument("-l", "--lang", type=str, default="en")
+    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
+    parser.add_argument("-p", "--librispeech_test_clean_path", type=str, required=True)
+    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
+    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    eval_task = args.eval_task
+    lang = args.lang
+    librispeech_test_clean_path = args.librispeech_test_clean_path  # test-clean path
+    gen_wav_dir = args.gen_wav_dir
+    metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
+
+    gpus = list(range(args.gpu_nums))
+    test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)
+
+    ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
+    ## leading to a low similarity for the ground truth in some cases.
+    # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True)  # eval ground truth
+
+    local = args.local
+    if local:  # use local custom checkpoint dir
+        asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
+    else:
+        asr_ckpt_dir = ""  # auto download to cache dir
+    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
+
+    # --------------------------- WER ---------------------------
+    if eval_task == "wer":
+        wers = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_asr_wer, args)
+            for wers_ in results:
+                wers.extend(wers_)
+
+        wer = round(np.mean(wers) * 100, 3)
+        print(f"\nTotal {len(wers)} samples")
+        print(f"WER : {wer}%")
+
+    # --------------------------- SIM ---------------------------
+    if eval_task == "sim":
+        sim_list = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_sim, args)
+            for sim_ in results:
+                sim_list.extend(sim_)
+
+        sim = round(sum(sim_list) / len(sim_list), 3)
+        print(f"\nTotal {len(sim_list)} samples")
+        print(f"SIM : {sim}")
+
+
+if __name__ == "__main__":
+    main()
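With this argparse refactor, the paths that used to be edited inside the script are now passed on the command line. A minimal sketch using only the arguments defined above (paths are placeholders):

```bash
# WER on LibriSpeech-PC test-clean, 8 GPUs by default
python src/f5_tts/eval/eval_librispeech_test_clean.py \
    --eval_task wer --gen_wav_dir <GEN_WAVE_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>

# speaker-similarity (SIM) on the same outputs, restricted to 4 GPUs
python src/f5_tts/eval/eval_librispeech_test_clean.py \
    --eval_task sim --gen_wav_dir <GEN_WAVE_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH> --gpu_nums 4
```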
src/f5_tts/eval/eval_seedtts_testset.py CHANGED
@@ -2,6 +2,7 @@
 
 import sys
 import os
+import argparse
 
 sys.path.append(os.getcwd())
 
@@ -19,57 +20,65 @@ from f5_tts.eval.utils_eval import (
 rel_path = str(files("f5_tts").joinpath("../../"))
 
 
-eval_task = "wer"  # sim | wer
-lang = "zh"  # zh | en
-metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset
-# gen_wav_dir = rel_path + f"/data/seedtts_testset/{lang}/wavs"  # ground truth wavs
-gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs
-
-
-# NOTE. paraformer-zh result will be slightly different according to the number of gpus, cuz batchsize is different
-# zh 1.254 seems a result of 4 workers wer_seed_tts
-gpus = [0, 1, 2, 3, 4, 5, 6, 7]
-test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)
-
-local = False
-if local:  # use local custom checkpoint dir
-    if lang == "zh":
-        asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
-    elif lang == "en":
-        asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
-else:
-    asr_ckpt_dir = ""  # auto download to cache dir
-
-wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
-
-
-# --------------------------- WER ---------------------------
-
-if eval_task == "wer":
-    wers = []
-
-    with mp.Pool(processes=len(gpus)) as pool:
-        args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
-        results = pool.map(run_asr_wer, args)
-        for wers_ in results:
-            wers.extend(wers_)
-
-    wer = round(np.mean(wers) * 100, 3)
-    print(f"\nTotal {len(wers)} samples")
-    print(f"WER : {wer}%")
-
-
-# --------------------------- SIM ---------------------------
-
-if eval_task == "sim":
-    sim_list = []
-
-    with mp.Pool(processes=len(gpus)) as pool:
-        args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
-        results = pool.map(run_sim, args)
-        for sim_ in results:
-            sim_list.extend(sim_)
-
-    sim = round(sum(sim_list) / len(sim_list), 3)
-    print(f"\nTotal {len(sim_list)} samples")
-    print(f"SIM : {sim}")
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
+    parser.add_argument("-l", "--lang", type=str, default="en", choices=["zh", "en"])
+    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
+    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
+    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    eval_task = args.eval_task
+    lang = args.lang
+    gen_wav_dir = args.gen_wav_dir
+    metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset
+
+    # NOTE. paraformer-zh result will be slightly different according to the number of gpus, cuz batchsize is different
+    # zh 1.254 seems a result of 4 workers wer_seed_tts
+    gpus = list(range(args.gpu_nums))
+    test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)
+
+    local = args.local
+    if local:  # use local custom checkpoint dir
+        if lang == "zh":
+            asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
+        elif lang == "en":
+            asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
+    else:
+        asr_ckpt_dir = ""  # auto download to cache dir
+    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
+
+    # --------------------------- WER ---------------------------
+
+    if eval_task == "wer":
+        wers = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_asr_wer, args)
+            for wers_ in results:
+                wers.extend(wers_)
+
+        wer = round(np.mean(wers) * 100, 3)
+        print(f"\nTotal {len(wers)} samples")
+        print(f"WER : {wer}%")
+
+    # --------------------------- SIM ---------------------------
+    if eval_task == "sim":
+        sim_list = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_sim, args)
+            for sim_ in results:
+                sim_list.extend(sim_)
+
+        sim = round(sum(sim_list) / len(sim_list), 3)
+        print(f"\nTotal {len(sim_list)} samples")
+        print(f"SIM : {sim}")
+
+
+if __name__ == "__main__":
+    main()
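The Seed-TTS evaluation gets the same CLI treatment; note the default language is now `en`, so Chinese runs must pass `--lang zh` explicitly. A minimal sketch using only the arguments defined above:

```bash
# WER on the zh split of the Seed-TTS test set
python src/f5_tts/eval/eval_seedtts_testset.py --eval_task wer --lang zh --gen_wav_dir <GEN_WAVE_DIR>

# SIM on the en split
python src/f5_tts/eval/eval_seedtts_testset.py --eval_task sim --lang en --gen_wav_dir <GEN_WAVE_DIR>
```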
src/f5_tts/train/README.md CHANGED
@@ -16,6 +16,9 @@ python src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
 
 # Prepare the LibriTTS dataset
 python src/f5_tts/train/datasets/prepare_libritts.py
+
+# Prepare the LJSpeech dataset
+python src/f5_tts/train/datasets/prepare_ljspeech.py
 ```
 
 ### 2. Create custom dataset with metadata.csv
@@ -35,7 +38,7 @@ Once your datasets are prepared, you can start the training process.
 # setup accelerate config, e.g. use multi-gpu ddp, fp16
 # will be to: ~/.cache/huggingface/accelerate/default_config.yaml
 accelerate config
-accelerate launch src/f5_tts/train/train.py
+accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml  # F5TTS_Base_train.yaml | E2TTS_Base_train.yaml
 ```
 
 ### 2. Finetuning practice
src/f5_tts/train/datasets/prepare_ljspeech.py ADDED
@@ -0,0 +1,64 @@
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+import json
+from importlib.resources import files
+from pathlib import Path
+from tqdm import tqdm
+import soundfile as sf
+from datasets.arrow_writer import ArrowWriter
+
+
+def main():
+    result = []
+    duration_list = []
+    text_vocab_set = set()
+
+    with open(meta_info, "r") as f:
+        lines = f.readlines()
+        for line in tqdm(lines):
+            uttr, text, norm_text = line.split("|")
+            wav_path = Path(dataset_dir) / "wavs" / f"{uttr}.wav"
+            duration = sf.info(wav_path).duration
+            if duration < 0.4 or duration > 30:
+                continue
+            result.append({"audio_path": str(wav_path), "text": norm_text, "duration": duration})
+            duration_list.append(duration)
+            text_vocab_set.update(list(norm_text))
+
+    # save preprocessed dataset to disk
+    if not os.path.exists(f"{save_dir}"):
+        os.makedirs(f"{save_dir}")
+    print(f"\nSaving to {save_dir} ...")
+
+    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
+        for line in tqdm(result, desc="Writing to raw.arrow ..."):
+            writer.write(line)
+
+    # dup a json separately saving duration in case for DynamicBatchSampler ease
+    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
+        json.dump({"duration": duration_list}, f, ensure_ascii=False)
+
+    # vocab map, i.e. tokenizer
+    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
+    with open(f"{save_dir}/vocab.txt", "w") as f:
+        for vocab in sorted(text_vocab_set):
+            f.write(vocab + "\n")
+
+    print(f"\nFor {dataset_name}, sample count: {len(result)}")
+    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
+    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
+
+
+if __name__ == "__main__":
+    tokenizer = "char"  # "pinyin" | "char"
+
+    dataset_dir = "<SOME_PATH>/LJSpeech-1.1"
+    dataset_name = f"LJSpeech_{tokenizer}"
+    meta_info = os.path.join(dataset_dir, "metadata.csv")
+    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
+    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")
+
+    main()
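The script expects the stock LJSpeech-1.1 layout (pipe-delimited metadata.csv plus a wavs/ directory) and keeps its paths as module-level constants, so the usual workflow is to edit `dataset_dir` and then run it directly, matching the command added to train/README.md in this commit:

```bash
# after setting dataset_dir to your local LJSpeech-1.1 path inside the script
python src/f5_tts/train/datasets/prepare_ljspeech.py
```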
src/f5_tts/train/train.py CHANGED
@@ -1,100 +1,71 @@
 # training script.
-
+import os
 from importlib.resources import files
 
+import hydra
+
 from f5_tts.model import CFM, DiT, Trainer, UNetT
 from f5_tts.model.dataset import load_dataset
 from f5_tts.model.utils import get_tokenizer
 
-# -------------------------- Dataset Settings --------------------------- #
-
-target_sample_rate = 24000
-n_mel_channels = 100
-hop_length = 256
-win_length = 1024
-n_fft = 1024
-mel_spec_type = "vocos"  # 'vocos' or 'bigvgan'
-
-tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
-tokenizer_path = None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
-dataset_name = "Emilia_ZH_EN"
-
-# -------------------------- Training Settings -------------------------- #
-
-exp_name = "F5TTS_Base"  # F5TTS_Base | E2TTS_Base
-
-learning_rate = 7.5e-5
-
-batch_size_per_gpu = 38400  # 8 GPUs, 8 * 38400 = 307200
-batch_size_type = "frame"  # "frame" or "sample"
-max_samples = 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-grad_accumulation_steps = 1  # note: updates = steps / grad_accumulation_steps
-max_grad_norm = 1.0
+os.chdir(str(files("f5_tts").joinpath("../..")))
 
-epochs = 11  # use linear decay, thus epochs control the slope
-num_warmup_updates = 20000  # warmup steps
-save_per_updates = 50000  # save checkpoint per steps
-last_per_steps = 5000  # save last checkpoint per steps
 
-# model params
-if exp_name == "F5TTS_Base":
-    wandb_resume_id = None
-    model_cls = DiT
-    model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
-elif exp_name == "E2TTS_Base":
-    wandb_resume_id = None
-    model_cls = UNetT
-    model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
-
-
-# ----------------------------------------------------------------------- #
+@hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
+def main(cfg):
+    tokenizer = cfg.model.tokenizer
+    mel_spec_type = cfg.model.mel_spec.mel_spec_type
+    exp_name = f"{cfg.model.name}_{mel_spec_type}_{cfg.model.tokenizer}_{cfg.datasets.name}"
 
-
-def main():
-    if tokenizer == "custom":
-        tokenizer_path = tokenizer_path
+    # set text tokenizer
+    if tokenizer != "custom":
+        tokenizer_path = cfg.datasets.name
     else:
-        tokenizer_path = dataset_name
+        tokenizer_path = cfg.model.tokenizer_path
     vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)
 
-    mel_spec_kwargs = dict(
-        n_fft=n_fft,
-        hop_length=hop_length,
-        win_length=win_length,
-        n_mel_channels=n_mel_channels,
-        target_sample_rate=target_sample_rate,
-        mel_spec_type=mel_spec_type,
-    )
+    # set model
+    if "F5TTS" in cfg.model.name:
+        model_cls = DiT
+    elif "E2TTS" in cfg.model.name:
+        model_cls = UNetT
+    wandb_resume_id = None
 
     model = CFM(
-        transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
-        mel_spec_kwargs=mel_spec_kwargs,
+        transformer=model_cls(**cfg.model.arch, text_num_embeds=vocab_size, mel_dim=cfg.model.mel_spec.n_mel_channels),
+        mel_spec_kwargs=cfg.model.mel_spec,
         vocab_char_map=vocab_char_map,
     )
 
+    # init trainer
     trainer = Trainer(
         model,
-        epochs,
-        learning_rate,
-        num_warmup_updates=num_warmup_updates,
-        save_per_updates=save_per_updates,
-        checkpoint_path=str(files("f5_tts").joinpath(f"../../ckpts/{exp_name}")),
-        batch_size=batch_size_per_gpu,
-        batch_size_type=batch_size_type,
-        max_samples=max_samples,
-        grad_accumulation_steps=grad_accumulation_steps,
-        max_grad_norm=max_grad_norm,
+        epochs=cfg.optim.epochs,
+        learning_rate=cfg.optim.learning_rate,
+        num_warmup_updates=cfg.optim.num_warmup_updates,
+        save_per_updates=cfg.ckpts.save_per_updates,
+        checkpoint_path=str(files("f5_tts").joinpath(f"../../{cfg.ckpts.save_dir}")),
+        batch_size=cfg.datasets.batch_size_per_gpu,
+        batch_size_type=cfg.datasets.batch_size_type,
+        max_samples=cfg.datasets.max_samples,
+        grad_accumulation_steps=cfg.optim.grad_accumulation_steps,
+        max_grad_norm=cfg.optim.max_grad_norm,
+        logger=cfg.ckpts.logger,
         wandb_project="CFM-TTS",
         wandb_run_name=exp_name,
        wandb_resume_id=wandb_resume_id,
-        last_per_steps=last_per_steps,
+        last_per_steps=cfg.ckpts.last_per_steps,
         log_samples=True,
+        bnb_optimizer=cfg.optim.bnb_optimizer,
         mel_spec_type=mel_spec_type,
+        is_local_vocoder=cfg.model.mel_spec.is_local_vocoder,
+        local_vocoder_path=cfg.model.mel_spec.local_vocoder_path,
    )
 
-    train_dataset = load_dataset(dataset_name, tokenizer, mel_spec_kwargs=mel_spec_kwargs)
+    train_dataset = load_dataset(cfg.datasets.name, tokenizer, mel_spec_kwargs=cfg.model.mel_spec)
     trainer.train(
         train_dataset,
+        num_workers=cfg.datasets.num_workers,
         resumable_with_seed=666,  # seed for shuffling dataset
     )
 
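Putting the pieces of this commit together, a prepared character-level dataset can be trained on by overriding the dataset and tokenizer fields of one of the new configs. This is a sketch only, assuming standard Hydra overrides and the LJSpeech_char dataset produced by prepare_ljspeech.py; the names are illustrative:

```bash
# train the small F5-TTS variant on LJSpeech with a char tokenizer built from the dataset's vocab.txt
accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Small_train.yaml \
    datasets.name=LJSpeech_char model.tokenizer=char
```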