mrfakename committed on
Commit b5979c9
1 Parent(s): b164af3

Sync from GitHub repo


This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo.

pyproject.toml CHANGED
@@ -39,6 +39,7 @@ dependencies = [
     "vocos",
     "wandb",
     "x_transformers>=1.31.14",
+    "hydra-core>=1.3.0",
 ]
 
 [project.optional-dependencies]
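Since `hydra-core` is now a core dependency, an editable install from the repo root should pick it up automatically. A minimal sketch of standard pip usage, not part of this commit:

```bash
# from the repository root: reinstall in editable mode so hydra-core is pulled in
pip install -e .
```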
src/f5_tts/configs/E2TTS_Base_train.yaml ADDED
@@ -0,0 +1,43 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN  # dataset name
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16  # number of workers
+
+optim:
+  epochs: 15  # max epochs
+  learning_rate: 7.5e-5  # learning rate
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb optimizer or not
+
+model:
+  name: E2TTS_Base  # model name
+  tokenizer: pinyin  # tokenizer type
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 1024  # model dimension
+    depth: 24  # number of transformer layers
+    heads: 16  # number of transformer heads
+    ff_mult: 4  # ff layer expansion
+  mel_spec:
+    target_sample_rate: 24000  # target sample rate
+    n_mel_channels: 100  # mel channel
+    hop_length: 256  # hop length
+    win_length: 1024  # window length
+    n_fft: 1024  # fft length
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False  # use local vocoder or not
+    local_vocoder_path: None  # path to local vocoder
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
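With this config in place, training can be pointed at it by name. A minimal sketch based on the launch command added to src/f5_tts/train/README.md in this same commit (the exact accelerate setup depends on your environment):

```bash
# configure accelerate once (multi-GPU DDP, fp16, etc.), then launch with the E2TTS base config
accelerate config
accelerate launch src/f5_tts/train/train.py --config-name E2TTS_Base_train.yaml
```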
src/f5_tts/configs/E2TTS_Small_train.yaml ADDED
@@ -0,0 +1,43 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16  # number of workers
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+  bnb_optimizer: False
+
+model:
+  name: E2TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 20
+    heads: 12
+    ff_mult: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
src/f5_tts/configs/F5TTS_Base_train.yaml ADDED
@@ -0,0 +1,45 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN  # dataset name
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16  # number of workers
+
+optim:
+  epochs: 15  # max epochs
+  learning_rate: 7.5e-5  # learning rate
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb optimizer or not
+
+model:
+  name: F5TTS_Base  # model name
+  tokenizer: pinyin  # tokenizer type
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 1024  # model dim
+    depth: 22  # model depth
+    heads: 16  # model heads
+    ff_mult: 2  # feedforward expansion
+    text_dim: 512  # text encoder dim
+    conv_layers: 4  # convolution layers
+  mel_spec:
+    target_sample_rate: 24000  # target sample rate
+    n_mel_channels: 100  # mel channel
+    hop_length: 256  # hop length
+    win_length: 1024  # window length
+    n_fft: 1024  # fft length
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False  # use local vocoder or not
+    local_vocoder_path: None  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
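Because these YAMLs are loaded through Hydra, individual fields can also be overridden from the command line without editing the file. A minimal sketch assuming standard Hydra `key=value` override syntax; the values shown are illustrative only:

```bash
# train F5TTS_Base but shrink the per-GPU batch and warmup for a smaller setup (example values)
accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml \
    datasets.batch_size_per_gpu=19200 optim.num_warmup_updates=10000
```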
src/f5_tts/configs/F5TTS_Small_train.yaml ADDED
@@ -0,0 +1,45 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # "frame" or "sample"
+  max_samples: 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16  # number of workers
+
+optim:
+  epochs: 15
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup steps
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+  bnb_optimizer: False
+
+model:
+  name: F5TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
+  arch:
+    dim: 768
+    depth: 18
+    heads: 12
+    ff_mult: 2
+    text_dim: 512
+    conv_layers: 4
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # 'vocos' or 'bigvgan'
+    is_local_vocoder: False
+    local_vocoder_path: None
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | None
+  save_per_updates: 50000  # save checkpoint per steps
+  last_per_steps: 5000  # save last checkpoint per steps
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
src/f5_tts/eval/README.md CHANGED
@@ -42,8 +42,8 @@ Then update in the following scripts with the paths you put evaluation model ckp
 Update the path with your batch-inferenced results, and carry out WER / SIM evaluations:
 ```bash
 # Evaluation for Seed-TTS test set
-python src/f5_tts/eval/eval_seedtts_testset.py
+python src/f5_tts/eval/eval_seedtts_testset.py --gen_wav_dir <GEN_WAVE_DIR>
 
 # Evaluation for LibriSpeech-PC test-clean (cross-sentence)
-python src/f5_tts/eval/eval_librispeech_test_clean.py
+python src/f5_tts/eval/eval_librispeech_test_clean.py --gen_wav_dir <GEN_WAVE_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>
 ```
src/f5_tts/eval/eval_infer_batch.py CHANGED
@@ -34,8 +34,6 @@ win_length = 1024
 n_fft = 1024
 target_rms = 0.1
 
-
-tokenizer = "pinyin"
 rel_path = str(files("f5_tts").joinpath("../../"))
 
 
@@ -49,6 +47,7 @@ def main():
     parser.add_argument("-n", "--expname", required=True)
     parser.add_argument("-c", "--ckptstep", default=1200000, type=int)
     parser.add_argument("-m", "--mel_spec_type", default="vocos", type=str, choices=["bigvgan", "vocos"])
+    parser.add_argument("-to", "--tokenizer", default="pinyin", type=str, choices=["pinyin", "char"])
 
     parser.add_argument("-nfe", "--nfestep", default=32, type=int)
     parser.add_argument("-o", "--odemethod", default="euler")
@@ -64,6 +63,7 @@ def main():
     ckpt_step = args.ckptstep
     ckpt_path = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}.pt"
     mel_spec_type = args.mel_spec_type
+    tokenizer = args.tokenizer
 
     nfe_step = args.nfestep
     ode_method = args.odemethod
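The new `-to/--tokenizer` flag makes the tokenizer selectable at evaluation time instead of being hard-coded in the script. A hypothetical invocation based only on the arguments visible in this diff (other flags keep their defaults; the experiment name is an example):

```bash
# batch inference with an explicit tokenizer choice; -n names the experiment whose checkpoint is loaded
python src/f5_tts/eval/eval_infer_batch.py -n F5TTS_Base -c 1200000 -m vocos -to pinyin
```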
src/f5_tts/eval/eval_librispeech_test_clean.py CHANGED
@@ -2,6 +2,7 @@
 
 import sys
 import os
+import argparse
 
 sys.path.append(os.getcwd())
 
@@ -19,55 +20,65 @@ from f5_tts.eval.utils_eval import (
 rel_path = str(files("f5_tts").joinpath("../../"))
 
 
-eval_task = "wer"  # sim | wer
-lang = "en"
-metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
-librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
-gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs
-
-gpus = [0, 1, 2, 3, 4, 5, 6, 7]
-test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)
-
-## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
-## leading to a low similarity for the ground truth in some cases.
-# test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True)  # eval ground truth
-
-local = False
-if local:  # use local custom checkpoint dir
-    asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
-else:
-    asr_ckpt_dir = ""  # auto download to cache dir
-
-wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
-
-
-# --------------------------- WER ---------------------------
-
-if eval_task == "wer":
-    wers = []
-
-    with mp.Pool(processes=len(gpus)) as pool:
-        args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
-        results = pool.map(run_asr_wer, args)
-        for wers_ in results:
-            wers.extend(wers_)
-
-    wer = round(np.mean(wers) * 100, 3)
-    print(f"\nTotal {len(wers)} samples")
-    print(f"WER : {wer}%")
-
-
-# --------------------------- SIM ---------------------------
-
-if eval_task == "sim":
-    sim_list = []
-
-    with mp.Pool(processes=len(gpus)) as pool:
-        args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
-        results = pool.map(run_sim, args)
-        for sim_ in results:
-            sim_list.extend(sim_)
-
-    sim = round(sum(sim_list) / len(sim_list), 3)
-    print(f"\nTotal {len(sim_list)} samples")
-    print(f"SIM : {sim}")
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
+    parser.add_argument("-l", "--lang", type=str, default="en")
+    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
+    parser.add_argument("-p", "--librispeech_test_clean_path", type=str, required=True)
+    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
+    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    eval_task = args.eval_task
+    lang = args.lang
+    librispeech_test_clean_path = args.librispeech_test_clean_path  # test-clean path
+    gen_wav_dir = args.gen_wav_dir
+    metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
+
+    gpus = list(range(args.gpu_nums))
+    test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)
+
+    ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
+    ## leading to a low similarity for the ground truth in some cases.
+    # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True)  # eval ground truth
+
+    local = args.local
+    if local:  # use local custom checkpoint dir
+        asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
+    else:
+        asr_ckpt_dir = ""  # auto download to cache dir
+    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
+
+    # --------------------------- WER ---------------------------
+    if eval_task == "wer":
+        wers = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_asr_wer, args)
+            for wers_ in results:
+                wers.extend(wers_)
+
+        wer = round(np.mean(wers) * 100, 3)
+        print(f"\nTotal {len(wers)} samples")
+        print(f"WER : {wer}%")
+
+    # --------------------------- SIM ---------------------------
+    if eval_task == "sim":
+        sim_list = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_sim, args)
+            for sim_ in results:
+                sim_list.extend(sim_)
+
+        sim = round(sum(sim_list) / len(sim_list), 3)
+        print(f"\nTotal {len(sim_list)} samples")
+        print(f"SIM : {sim}")
+
+
+if __name__ == "__main__":
+    main()
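With this argparse refactor, the paths that used to be edited inside the script are now passed on the command line. A minimal sketch using only the arguments defined above (paths are placeholders):

```bash
# WER on LibriSpeech-PC test-clean, 8 GPUs by default
python src/f5_tts/eval/eval_librispeech_test_clean.py \
    --eval_task wer --gen_wav_dir <GEN_WAVE_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>

# speaker-similarity (SIM) on the same outputs, restricted to 4 GPUs
python src/f5_tts/eval/eval_librispeech_test_clean.py \
    --eval_task sim --gen_wav_dir <GEN_WAVE_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH> --gpu_nums 4
```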
src/f5_tts/eval/eval_seedtts_testset.py CHANGED
@@ -2,6 +2,7 @@
 
 import sys
 import os
+import argparse
 
 sys.path.append(os.getcwd())
 
@@ -19,57 +20,65 @@ from f5_tts.eval.utils_eval import (
 rel_path = str(files("f5_tts").joinpath("../../"))
 
 
-eval_task = "wer"  # sim | wer
-lang = "zh"  # zh | en
-metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset
-# gen_wav_dir = rel_path + f"/data/seedtts_testset/{lang}/wavs"  # ground truth wavs
-gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs
-
-
-# NOTE. paraformer-zh result will be slightly different according to the number of gpus, cuz batchsize is different
-# zh 1.254 seems a result of 4 workers wer_seed_tts
-gpus = [0, 1, 2, 3, 4, 5, 6, 7]
-test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)
-
-local = False
-if local:  # use local custom checkpoint dir
-    if lang == "zh":
-        asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
-    elif lang == "en":
-        asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
-else:
-    asr_ckpt_dir = ""  # auto download to cache dir
-
-wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
-
-
-# --------------------------- WER ---------------------------
-
-if eval_task == "wer":
-    wers = []
-
-    with mp.Pool(processes=len(gpus)) as pool:
-        args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
-        results = pool.map(run_asr_wer, args)
-        for wers_ in results:
-            wers.extend(wers_)
-
-    wer = round(np.mean(wers) * 100, 3)
-    print(f"\nTotal {len(wers)} samples")
-    print(f"WER : {wer}%")
-
-
-# --------------------------- SIM ---------------------------
-
-if eval_task == "sim":
-    sim_list = []
-
-    with mp.Pool(processes=len(gpus)) as pool:
-        args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
-        results = pool.map(run_sim, args)
-        for sim_ in results:
-            sim_list.extend(sim_)
-
-    sim = round(sum(sim_list) / len(sim_list), 3)
-    print(f"\nTotal {len(sim_list)} samples")
-    print(f"SIM : {sim}")
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
+    parser.add_argument("-l", "--lang", type=str, default="en", choices=["zh", "en"])
+    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
+    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
+    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    eval_task = args.eval_task
+    lang = args.lang
+    gen_wav_dir = args.gen_wav_dir
+    metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset
+
+    # NOTE. paraformer-zh result will be slightly different according to the number of gpus, cuz batchsize is different
+    # zh 1.254 seems a result of 4 workers wer_seed_tts
+    gpus = list(range(args.gpu_nums))
+    test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)
+
+    local = args.local
+    if local:  # use local custom checkpoint dir
+        if lang == "zh":
+            asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
+        elif lang == "en":
+            asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
+    else:
+        asr_ckpt_dir = ""  # auto download to cache dir
+    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
+
+    # --------------------------- WER ---------------------------
+
+    if eval_task == "wer":
+        wers = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_asr_wer, args)
+            for wers_ in results:
+                wers.extend(wers_)
+
+        wer = round(np.mean(wers) * 100, 3)
+        print(f"\nTotal {len(wers)} samples")
+        print(f"WER : {wer}%")
+
+    # --------------------------- SIM ---------------------------
+    if eval_task == "sim":
+        sim_list = []
+        with mp.Pool(processes=len(gpus)) as pool:
+            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
+            results = pool.map(run_sim, args)
+            for sim_ in results:
+                sim_list.extend(sim_)
+
+        sim = round(sum(sim_list) / len(sim_list), 3)
+        print(f"\nTotal {len(sim_list)} samples")
+        print(f"SIM : {sim}")
+
+
+if __name__ == "__main__":
+    main()
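The Seed-TTS evaluation gets the same CLI treatment; note the default language is now `en`, so Chinese runs must pass `--lang zh` explicitly. A minimal sketch using only the arguments defined above:

```bash
# WER on the zh split of the Seed-TTS test set
python src/f5_tts/eval/eval_seedtts_testset.py --eval_task wer --lang zh --gen_wav_dir <GEN_WAVE_DIR>

# SIM on the en split
python src/f5_tts/eval/eval_seedtts_testset.py --eval_task sim --lang en --gen_wav_dir <GEN_WAVE_DIR>
```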
src/f5_tts/train/README.md CHANGED
@@ -16,6 +16,9 @@ python src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
 
 # Prepare the LibriTTS dataset
 python src/f5_tts/train/datasets/prepare_libritts.py
+
+# Prepare the LJSpeech dataset
+python src/f5_tts/train/datasets/prepare_ljspeech.py
 ```
 
 ### 2. Create custom dataset with metadata.csv
@@ -35,7 +38,7 @@ Once your datasets are prepared, you can start the training process.
 # setup accelerate config, e.g. use multi-gpu ddp, fp16
 # will be to: ~/.cache/huggingface/accelerate/default_config.yaml
 accelerate config
-accelerate launch src/f5_tts/train/train.py
+accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Base_train.yaml  # F5TTS_Base_train.yaml | E2TTS_Base_train.yaml
 ```
 
 ### 2. Finetuning practice
src/f5_tts/train/datasets/prepare_ljspeech.py ADDED
@@ -0,0 +1,64 @@
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+import json
+from importlib.resources import files
+from pathlib import Path
+from tqdm import tqdm
+import soundfile as sf
+from datasets.arrow_writer import ArrowWriter
+
+
+def main():
+    result = []
+    duration_list = []
+    text_vocab_set = set()
+
+    with open(meta_info, "r") as f:
+        lines = f.readlines()
+        for line in tqdm(lines):
+            uttr, text, norm_text = line.split("|")
+            wav_path = Path(dataset_dir) / "wavs" / f"{uttr}.wav"
+            duration = sf.info(wav_path).duration
+            if duration < 0.4 or duration > 30:
+                continue
+            result.append({"audio_path": str(wav_path), "text": norm_text, "duration": duration})
+            duration_list.append(duration)
+            text_vocab_set.update(list(norm_text))
+
+    # save preprocessed dataset to disk
+    if not os.path.exists(f"{save_dir}"):
+        os.makedirs(f"{save_dir}")
+    print(f"\nSaving to {save_dir} ...")
+
+    with ArrowWriter(path=f"{save_dir}/raw.arrow") as writer:
+        for line in tqdm(result, desc="Writing to raw.arrow ..."):
+            writer.write(line)
+
+    # dup a json separately saving duration in case for DynamicBatchSampler ease
+    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
+        json.dump({"duration": duration_list}, f, ensure_ascii=False)
+
+    # vocab map, i.e. tokenizer
+    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
+    with open(f"{save_dir}/vocab.txt", "w") as f:
+        for vocab in sorted(text_vocab_set):
+            f.write(vocab + "\n")
+
+    print(f"\nFor {dataset_name}, sample count: {len(result)}")
+    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}")
+    print(f"For {dataset_name}, total {sum(duration_list)/3600:.2f} hours")
+
+
+if __name__ == "__main__":
+    tokenizer = "char"  # "pinyin" | "char"
+
+    dataset_dir = "<SOME_PATH>/LJSpeech-1.1"
+    dataset_name = f"LJSpeech_{tokenizer}"
+    meta_info = os.path.join(dataset_dir, "metadata.csv")
+    save_dir = str(files("f5_tts").joinpath("../../")) + f"/data/{dataset_name}"
+    print(f"\nPrepare for {dataset_name}, will save to {save_dir}\n")
+
+    main()
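The script expects the stock LJSpeech-1.1 layout (pipe-delimited metadata.csv plus a wavs/ directory) and keeps its paths as module-level constants, so the usual workflow is to edit `dataset_dir` and then run it directly, matching the command added to train/README.md in this commit:

```bash
# after setting dataset_dir to your local LJSpeech-1.1 path inside the script
python src/f5_tts/train/datasets/prepare_ljspeech.py
```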
src/f5_tts/train/train.py CHANGED
@@ -1,100 +1,71 @@
 # training script.
-
+import os
 from importlib.resources import files
 
+import hydra
+
 from f5_tts.model import CFM, DiT, Trainer, UNetT
 from f5_tts.model.dataset import load_dataset
 from f5_tts.model.utils import get_tokenizer
 
-# -------------------------- Dataset Settings --------------------------- #
-
-target_sample_rate = 24000
-n_mel_channels = 100
-hop_length = 256
-win_length = 1024
-n_fft = 1024
-mel_spec_type = "vocos"  # 'vocos' or 'bigvgan'
-
-tokenizer = "pinyin"  # 'pinyin', 'char', or 'custom'
-tokenizer_path = None  # if tokenizer = 'custom', define the path to the tokenizer you want to use (should be vocab.txt)
-dataset_name = "Emilia_ZH_EN"
-
-# -------------------------- Training Settings -------------------------- #
-
-exp_name = "F5TTS_Base"  # F5TTS_Base | E2TTS_Base
-
-learning_rate = 7.5e-5
-
-batch_size_per_gpu = 38400  # 8 GPUs, 8 * 38400 = 307200
-batch_size_type = "frame"  # "frame" or "sample"
-max_samples = 64  # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
-grad_accumulation_steps = 1  # note: updates = steps / grad_accumulation_steps
-max_grad_norm = 1.0
+os.chdir(str(files("f5_tts").joinpath("../..")))
 
-epochs = 11  # use linear decay, thus epochs control the slope
-num_warmup_updates = 20000  # warmup steps
-save_per_updates = 50000  # save checkpoint per steps
-last_per_steps = 5000  # save last checkpoint per steps
 
-# model params
-if exp_name == "F5TTS_Base":
-    wandb_resume_id = None
-    model_cls = DiT
-    model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
-elif exp_name == "E2TTS_Base":
-    wandb_resume_id = None
-    model_cls = UNetT
-    model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
-
-
-# ----------------------------------------------------------------------- #
+@hydra.main(version_base="1.3", config_path=str(files("f5_tts").joinpath("configs")), config_name=None)
+def main(cfg):
+    tokenizer = cfg.model.tokenizer
+    mel_spec_type = cfg.model.mel_spec.mel_spec_type
+    exp_name = f"{cfg.model.name}_{mel_spec_type}_{cfg.model.tokenizer}_{cfg.datasets.name}"
 
-
-def main():
-    if tokenizer == "custom":
-        tokenizer_path = tokenizer_path
+    # set text tokenizer
+    if tokenizer != "custom":
+        tokenizer_path = cfg.datasets.name
     else:
-        tokenizer_path = dataset_name
+        tokenizer_path = cfg.model.tokenizer_path
     vocab_char_map, vocab_size = get_tokenizer(tokenizer_path, tokenizer)
 
-    mel_spec_kwargs = dict(
-        n_fft=n_fft,
-        hop_length=hop_length,
-        win_length=win_length,
-        n_mel_channels=n_mel_channels,
-        target_sample_rate=target_sample_rate,
-        mel_spec_type=mel_spec_type,
-    )
+    # set model
+    if "F5TTS" in cfg.model.name:
+        model_cls = DiT
+    elif "E2TTS" in cfg.model.name:
+        model_cls = UNetT
+    wandb_resume_id = None
 
     model = CFM(
-        transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
-        mel_spec_kwargs=mel_spec_kwargs,
+        transformer=model_cls(**cfg.model.arch, text_num_embeds=vocab_size, mel_dim=cfg.model.mel_spec.n_mel_channels),
+        mel_spec_kwargs=cfg.model.mel_spec,
         vocab_char_map=vocab_char_map,
     )
 
+    # init trainer
     trainer = Trainer(
         model,
-        epochs,
-        learning_rate,
-        num_warmup_updates=num_warmup_updates,
-        save_per_updates=save_per_updates,
-        checkpoint_path=str(files("f5_tts").joinpath(f"../../ckpts/{exp_name}")),
-        batch_size=batch_size_per_gpu,
-        batch_size_type=batch_size_type,
-        max_samples=max_samples,
-        grad_accumulation_steps=grad_accumulation_steps,
-        max_grad_norm=max_grad_norm,
+        epochs=cfg.optim.epochs,
+        learning_rate=cfg.optim.learning_rate,
+        num_warmup_updates=cfg.optim.num_warmup_updates,
+        save_per_updates=cfg.ckpts.save_per_updates,
+        checkpoint_path=str(files("f5_tts").joinpath(f"../../{cfg.ckpts.save_dir}")),
+        batch_size=cfg.datasets.batch_size_per_gpu,
+        batch_size_type=cfg.datasets.batch_size_type,
+        max_samples=cfg.datasets.max_samples,
+        grad_accumulation_steps=cfg.optim.grad_accumulation_steps,
+        max_grad_norm=cfg.optim.max_grad_norm,
+        logger=cfg.ckpts.logger,
         wandb_project="CFM-TTS",
         wandb_run_name=exp_name,
        wandb_resume_id=wandb_resume_id,
-        last_per_steps=last_per_steps,
+        last_per_steps=cfg.ckpts.last_per_steps,
         log_samples=True,
+        bnb_optimizer=cfg.optim.bnb_optimizer,
         mel_spec_type=mel_spec_type,
+        is_local_vocoder=cfg.model.mel_spec.is_local_vocoder,
+        local_vocoder_path=cfg.model.mel_spec.local_vocoder_path,
    )
 
-    train_dataset = load_dataset(dataset_name, tokenizer, mel_spec_kwargs=mel_spec_kwargs)
+    train_dataset = load_dataset(cfg.datasets.name, tokenizer, mel_spec_kwargs=cfg.model.mel_spec)
     trainer.train(
         train_dataset,
+        num_workers=cfg.datasets.num_workers,
         resumable_with_seed=666,  # seed for shuffling dataset
     )
 
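Putting the pieces of this commit together, a prepared character-level dataset can be trained on by overriding the dataset and tokenizer fields of one of the new configs. This is a sketch only, assuming standard Hydra overrides and the LJSpeech_char dataset produced by prepare_ljspeech.py; the names are illustrative:

```bash
# train the small F5-TTS variant on LJSpeech with a char tokenizer built from the dataset's vocab.txt
accelerate launch src/f5_tts/train/train.py --config-name F5TTS_Small_train.yaml \
    datasets.name=LJSpeech_char model.tokenizer=char
```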