yerfor committed
Commit
22871e7
1 Parent(s): 2874937
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. README.md +4 -6
  2. data/binary/ljspeech/phone_set.json +1 -0
  3. data/binary/ljspeech/spk_map.json +1 -0
  4. data/binary/ljspeech/word_set.json +0 -0
  5. egs/datasets/audio/biaobei/__pycache__/preprocess.cpython-36.pyc +0 -0
  6. egs/datasets/audio/biaobei/base_text2mel.yaml +18 -0
  7. egs/datasets/audio/biaobei/preprocess.py +16 -0
  8. egs/datasets/audio/biaobei/ps_flow.yaml +3 -0
  9. egs/datasets/audio/biaobei/synta.yaml +19 -0
  10. egs/datasets/audio/libritts/__pycache__/preprocess.cpython-36.pyc +0 -0
  11. egs/datasets/audio/libritts/base_text2mel.yaml +22 -0
  12. egs/datasets/audio/libritts/preprocess.py +13 -0
  13. egs/datasets/audio/libritts/ps_flow.yaml +3 -0
  14. egs/datasets/audio/libritts/synta.yaml +19 -0
  15. egs/datasets/audio/lj/__pycache__/preprocess.cpython-36.pyc +0 -0
  16. egs/datasets/audio/lj/base_mel2wav.yaml +4 -0
  17. egs/datasets/audio/lj/base_text2mel.yaml +17 -0
  18. egs/datasets/audio/lj/ds.yaml +29 -0
  19. egs/datasets/audio/lj/fs.yaml +3 -0
  20. egs/datasets/audio/lj/fs2_orig.yaml +4 -0
  21. egs/datasets/audio/lj/hifigan.yaml +3 -0
  22. egs/datasets/audio/lj/preprocess.py +9 -0
  23. egs/datasets/audio/lj/ps_flow.yaml +3 -0
  24. egs/datasets/audio/lj/ps_flow_nips2021.yaml +11 -0
  25. egs/datasets/audio/lj/ps_flow_small.yaml +3 -0
  26. egs/datasets/audio/lj/ps_flow_small_nips2021.yaml +11 -0
  27. egs/datasets/audio/lj/synta.yaml +19 -0
  28. egs/egs_bases/config_base.yaml +41 -0
  29. egs/egs_bases/tts/base.yaml +56 -0
  30. egs/egs_bases/tts/base_zh.yaml +5 -0
  31. egs/egs_bases/tts/dataset_params.yaml +52 -0
  32. egs/egs_bases/tts/ds.yaml +33 -0
  33. egs/egs_bases/tts/fs.yaml +75 -0
  34. egs/egs_bases/tts/fs2_orig.yaml +13 -0
  35. egs/egs_bases/tts/ps.yaml +63 -0
  36. egs/egs_bases/tts/ps_flow.yaml +20 -0
  37. egs/egs_bases/tts/ps_flow_small.yaml +42 -0
  38. egs/egs_bases/tts/synta.yaml +20 -0
  39. egs/egs_bases/tts/vocoder/base.yaml +20 -0
  40. egs/egs_bases/tts/vocoder/hifigan.yaml +28 -0
  41. inference/tts/__pycache__/base_tts_infer.cpython-36.pyc +0 -0
  42. inference/tts/base_tts_infer.py +120 -0
  43. inference/tts/ds.py +30 -0
  44. inference/tts/fs.py +29 -0
  45. inference/tts/fs2_orig.py +17 -0
  46. inference/tts/gradio/gradio_settings.yaml +12 -0
  47. inference/tts/gradio/infer.py +69 -0
  48. inference/tts/ps_flow.py +39 -0
  49. inference/tts/synta.py +76 -0
  50. modules/commons/__pycache__/conv.cpython-36.pyc +0 -0
README.md CHANGED
@@ -1,12 +1,10 @@
  ---
  title: SyntaSpeech
- emoji: 📊
- colorFrom: red
- colorTo: blue
+ emoji: 🤗
+ colorFrom: yellow
+ colorTo: orange
  sdk: gradio
- sdk_version: 2.9.4
- app_file: app.py
+ app_file: "inference/tts/gradio/infer.py"
  pinned: false
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
data/binary/ljspeech/phone_set.json ADDED
@@ -0,0 +1 @@
+ ["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"]
data/binary/ljspeech/spk_map.json ADDED
@@ -0,0 +1 @@
+ {"<SINGLE_SPK>": 0}
data/binary/ljspeech/word_set.json ADDED
The diff for this file is too large to render.
egs/datasets/audio/biaobei/__pycache__/preprocess.cpython-36.pyc ADDED
Binary file (1.12 kB)
egs/datasets/audio/biaobei/base_text2mel.yaml ADDED
@@ -0,0 +1,18 @@
+ base_config: egs/egs_bases/tts/base_zh.yaml
+ raw_data_dir: 'data/raw/biaobei'
+ processed_data_dir: 'data/processed/biaobei'
+ binary_data_dir: 'data/binary/biaobei'
+ preprocess_cls: egs.datasets.audio.biaobei.preprocess.BiaobeiPreprocess
+
+ ds_name: biaobei
+ binarization_args:
+   train_range: [ 871, -1 ]
+   test_range: [ 0, 523 ]
+   valid_range: [ 523, 871 ]
+ test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+             10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+             68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+ f0_min: 80
+ f0_max: 600
+ vocoder_ckpt: checkpoints/hifi_biaobei
egs/datasets/audio/biaobei/preprocess.py ADDED
@@ -0,0 +1,16 @@
+ from data_gen.tts.base_preprocess import BasePreprocessor
+ import re
+
+
+ class BiaobeiPreprocess(BasePreprocessor):
+     def meta_data(self):
+         input_dir = self.raw_data_dir
+         with open(f"{input_dir}/ProsodyLabeling/000001-010000.txt", encoding='utf-8') as f:
+             bb_lines = f.readlines()[::2]
+         for l_idx, l in (enumerate([re.sub("\#\d+", "", l.split('\t')[1].strip()) for l in bb_lines])):
+             item_name = f'{l_idx + 1:06d}'
+             wav_fn = f"{input_dir}/wav/{l_idx + 1:06d}.wav"
+             yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': l}
+
+ if __name__ == "__main__":
+     BiaobeiPreprocess().process()
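Note: the parsing above assumes the standard BZNSYP (Biaobei) ProsodyLabeling layout, in which every utterance occupies two lines: the transcript with #N prosody markers, then a pinyin line that the [::2] slice skips. A minimal sketch of one iteration, using an illustrative input line that is not part of this diff:

    # illustrative ProsodyLabeling line: "000001\t卡尔普#2陪外孙#1玩滑梯#4。"
    # after l.split('\t')[1].strip() and re.sub("\#\d+", "", ...), meta_data() yields roughly:
    {'item_name': '000001', 'wav_fn': 'data/raw/biaobei/wav/000001.wav', 'txt': '卡尔普陪外孙玩滑梯。'}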
egs/datasets/audio/biaobei/ps_flow.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/ps_flow.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/biaobei/synta.yaml ADDED
@@ -0,0 +1,19 @@
+ base_config:
+   - egs/egs_bases/tts/synta.yaml
+   - ./base_text2mel.yaml
+
+ lambda_mel_adv: 0.05
+
+ disc_win_num: 3
+ mel_disc_hidden_size: 128
+ disc_norm: in
+ disc_reduction: stack
+ disc_interval: 1
+ disc_lr: 0.0001
+ disc_start_steps: 0
+ discriminator_scheduler_params:
+   gamma: 0.5
+   step_size: 40000
+ discriminator_optimizer_params:
+   eps: 1.0e-06
+   weight_decay: 0.0
egs/datasets/audio/libritts/__pycache__/preprocess.cpython-36.pyc ADDED
Binary file (893 Bytes)
egs/datasets/audio/libritts/base_text2mel.yaml ADDED
@@ -0,0 +1,22 @@
+ ds_name: libritts
+ base_config: egs/egs_bases/tts/base.yaml
+ raw_data_dir: 'data/raw/LibriTTS'
+ processed_data_dir: 'data/processed/libritts'
+ binary_data_dir: 'data/binary/libritts'
+ preprocess_cls: egs.datasets.audio.libritts.preprocess.LibriTTSPreprocess
+ binarization_args:
+   train_range: [ 871, -1 ]
+   test_range: [ 0, 523 ]
+   valid_range: [ 523, 871 ]
+   shuffle: false
+   with_spk_id: true
+   with_spk_embed: false
+ test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+             10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+             68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+ f0_min: 80
+ f0_max: 600
+ vocoder: PWG
+ vocoder_ckpt: checkpoints/pwg_libritts
+ num_spk: 2000
egs/datasets/audio/libritts/preprocess.py ADDED
@@ -0,0 +1,13 @@
+ from data_gen.tts.base_preprocess import BasePreprocessor
+ import glob, os
+
+ class LibriTTSPreprocess(BasePreprocessor):
+     def meta_data(self):
+         wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*/*.wav'))
+         for wav_fn in wav_fns:
+             item_name = os.path.basename(wav_fn)[:-4]
+             txt_fn = f'{wav_fn[:-4]}.normalized.txt'
+             with open(txt_fn, 'r') as f:
+                 txt = f.read()
+             spk_name = item_name.split("_")[0]
+             yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt, 'spk_name': spk_name}
egs/datasets/audio/libritts/ps_flow.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/ps_flow.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/libritts/synta.yaml ADDED
@@ -0,0 +1,19 @@
+ base_config:
+   - egs/egs_bases/tts/synta.yaml
+   - ./base_text2mel.yaml
+
+ lambda_mel_adv: 0.05
+
+ disc_win_num: 3
+ mel_disc_hidden_size: 128
+ disc_norm: in
+ disc_reduction: stack
+ disc_interval: 1
+ disc_lr: 0.0001
+ disc_start_steps: 0
+ discriminator_scheduler_params:
+   gamma: 0.5
+   step_size: 40000
+ discriminator_optimizer_params:
+   eps: 1.0e-06
+   weight_decay: 0.0
egs/datasets/audio/lj/__pycache__/preprocess.cpython-36.pyc ADDED
Binary file (711 Bytes)
egs/datasets/audio/lj/base_mel2wav.yaml ADDED
@@ -0,0 +1,4 @@
+ base_config: egs/egs_bases/tts/vocoder/base.yaml
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech_wav'
egs/datasets/audio/lj/base_text2mel.yaml ADDED
@@ -0,0 +1,17 @@
+ ds_name: ljspeech
+ base_config: egs/egs_bases/tts/base.yaml
+ raw_data_dir: 'data/raw/LJSpeech-1.1'
+ processed_data_dir: 'data/processed/ljspeech'
+ binary_data_dir: 'data/binary/ljspeech'
+ preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+ binarization_args:
+   train_range: [ 871, -1 ]
+   test_range: [ 0, 523 ]
+   valid_range: [ 523, 871 ]
+ test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+             10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+             68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+             316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+ f0_min: 80
+ f0_max: 600
+ vocoder_ckpt: checkpoints/hifi_lj
egs/datasets/audio/lj/ds.yaml ADDED
@@ -0,0 +1,29 @@
+ base_config:
+   - egs/egs_bases/tts/ds.yaml
+   - ./fs2_orig.yaml
+
+ fs2_ckpt: checkpoints/aux_exp/model_ckpt_steps_100000.ckpt
+
+ # spec_min and spec_max are calculated on the training set.
+ spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672,
+             -4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759,
+             -4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733,
+             -5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510,
+             -5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916,
+             -4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875,
+             -5.0483, -5.0848, -5.0655, -5.0279, -5.0015, -5.0792, -5.0636, -5.2413,
+             -5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173,
+             -5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757,
+             -5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ]
+ spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5830, 0.7093,
+             0.6461, 0.6101, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591,
+             0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492,
+             0.6909, 0.6106, 0.5761, 0.5236, 0.5638, 0.4054, 0.4545, 0.3407,
+             0.3037, 0.3380, 0.1599, 0.1603, 0.2741, 0.2130, 0.1569, 0.1911,
+             0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933,
+             -0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405,
+             -0.1244, -0.2582, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000,
+             0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.1389, 0.1382,
+             0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2280, 0.2932, 0.3047 ]
+
+ max_tokens: 30000
egs/datasets/audio/lj/fs.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/fs.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/lj/fs2_orig.yaml ADDED
@@ -0,0 +1,4 @@
+ base_config:
+   - egs/egs_bases/tts/fs2_orig.yaml
+   - ./base_text2mel.yaml
+ binary_data_dir: 'data/binary/ljspeech_cwt'
egs/datasets/audio/lj/hifigan.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/vocoder/hifigan.yaml
+   - ./base_mel2wav.yaml
egs/datasets/audio/lj/preprocess.py ADDED
@@ -0,0 +1,9 @@
+ from data_gen.tts.base_preprocess import BasePreprocessor
+
+
+ class LJPreprocess(BasePreprocessor):
+     def meta_data(self):
+         for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
+             item_name, _, txt = l.strip().split("|")
+             wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
+             yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}
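Note: metadata.csv in LJSpeech-1.1 is pipe-separated with three fields per line (file id, raw transcript, normalized transcript), so the unpacking above keeps the normalized text. A minimal sketch with an illustrative, truncated line:

    line = "LJ001-0001|Printing, in the only sense...|printing, in the only sense with which we are at present concerned, ..."
    item_name, _, txt = line.strip().split("|")
    # item_name == 'LJ001-0001'; txt is the normalized transcript;
    # wav_fn becomes 'data/raw/LJSpeech-1.1/wavs/LJ001-0001.wav'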
egs/datasets/audio/lj/ps_flow.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/ps_flow.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_nips2021.yaml ADDED
@@ -0,0 +1,11 @@
+ base_config:
+   - ./ps_flow.yaml
+ max_sentences: 64
+ dur_level: word
+ use_word_encoder: false
+ enc_prenet: true
+ enc_pre_ln: false
+ fvae_encoder_type: wn
+ fvae_decoder_type: wn
+ text_encoder_postnet: false
+ warmup_updates: 8000
egs/datasets/audio/lj/ps_flow_small.yaml ADDED
@@ -0,0 +1,3 @@
+ base_config:
+   - egs/egs_bases/tts/ps_flow_small.yaml
+   - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_small_nips2021.yaml ADDED
@@ -0,0 +1,11 @@
+ base_config:
+   - ./ps_flow_small.yaml
+ max_sentences: 128
+ dur_level: word
+ use_word_encoder: false
+ enc_prenet: true
+ enc_pre_ln: false
+ fvae_encoder_type: wn
+ fvae_decoder_type: wn
+ text_encoder_postnet: false
+ warmup_updates: 8000
egs/datasets/audio/lj/synta.yaml ADDED
@@ -0,0 +1,19 @@
+ base_config:
+   - egs/egs_bases/tts/synta.yaml
+   - ./base_text2mel.yaml
+
+ lambda_mel_adv: 0.05
+
+ disc_win_num: 3
+ mel_disc_hidden_size: 128
+ disc_norm: in
+ disc_reduction: stack
+ disc_interval: 1
+ disc_lr: 0.0001
+ disc_start_steps: 0
+ discriminator_scheduler_params:
+   gamma: 0.5
+   step_size: 40000
+ discriminator_optimizer_params:
+   eps: 1.0e-06
+   weight_decay: 0.0
egs/egs_bases/config_base.yaml ADDED
@@ -0,0 +1,41 @@
+ # task
+ binary_data_dir: ''
+ work_dir: '' # experiment directory.
+ infer: false # infer
+ amp: false
+ seed: 1234
+ debug: false
+ save_codes: ['tasks', 'modules', 'egs']
+
+ #############
+ # dataset
+ #############
+ ds_workers: 1
+ test_num: 100
+ endless_ds: true
+ sort_by_len: true
+
+ #########
+ # train and eval
+ #########
+ print_nan_grads: false
+ load_ckpt: ''
+ save_best: false
+ num_ckpt_keep: 3
+ clip_grad_norm: 0
+ accumulate_grad_batches: 1
+ tb_log_interval: 100
+ num_sanity_val_steps: 5 # steps of validation at the beginning
+ check_val_every_n_epoch: 10
+ val_check_interval: 2000
+ valid_monitor_key: 'val_loss'
+ valid_monitor_mode: 'min'
+ max_epochs: 1000
+ max_updates: 1000000
+ max_tokens: 40000
+ max_sentences: 100000
+ max_valid_tokens: -1
+ max_valid_sentences: -1
+ eval_max_batches: -1
+ resume_from_checkpoint: 0
+ rename_tmux: true
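Note: every experiment config below composes these defaults through its base_config key (a single path or a list of paths, given either from the repo root or relative to the including file). A minimal sketch of the recursive-merge idea, as an illustration only and not the project's actual utils.commons.hparams implementation:

    import os
    import yaml

    def load_config(path):
        # load one YAML config and shallow-merge it over everything listed in its base_config
        with open(path) as f:
            cfg = yaml.safe_load(f) or {}
        bases = cfg.pop('base_config', [])
        bases = [bases] if isinstance(bases, str) else bases
        merged = {}
        for base in bases:
            base_path = os.path.join(os.path.dirname(path), base) if base.startswith('.') else base
            merged.update(load_config(base_path))
        merged.update(cfg)  # keys in the derived config override its bases
        return merged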
egs/egs_bases/tts/base.yaml ADDED
@@ -0,0 +1,56 @@
+ # task
+ base_config:
+   - ../config_base.yaml
+   - ./dataset_params.yaml
+
+ #############
+ # dataset in training
+ #############
+ endless_ds: true
+ min_frames: 0
+ max_frames: 1548
+ frames_multiple: 1
+ max_input_tokens: 1550
+ ds_workers: 1
+
+ #########
+ # model
+ #########
+ use_spk_id: false
+ use_spk_embed: false
+ mel_losses: "ssim:0.5|l1:0.5"
+
+ ###########
+ # optimization
+ ###########
+ lr: 0.0005
+ scheduler: warmup # rsqrt|warmup|none
+ warmup_updates: 4000
+ optimizer_adam_beta1: 0.9
+ optimizer_adam_beta2: 0.98
+ weight_decay: 0
+ clip_grad_norm: 1
+ clip_grad_value: 0
+
+
+ ###########
+ # train and eval
+ ###########
+ use_word_input: false
+ max_valid_sentences: 1
+ max_valid_tokens: 60000
+ valid_infer_interval: 10000
+ train_set_name: 'train'
+ train_sets: ''
+ valid_set_name: 'valid'
+ test_set_name: 'test'
+ num_valid_plots: 10
+ test_ids: [ ]
+ test_input_yaml: ''
+ vocoder: HifiGAN
+ vocoder_ckpt: ''
+ profile_infer: false
+ out_wav_norm: false
+ save_gt: true
+ save_f0: false
+ gen_dir_name: ''
egs/egs_bases/tts/base_zh.yaml ADDED
@@ -0,0 +1,5 @@
+ base_config: ./base.yaml
+ preprocess_args:
+   txt_processor: zh
+
+ word_size: 3000
egs/egs_bases/tts/dataset_params.yaml ADDED
@@ -0,0 +1,52 @@
+ audio_num_mel_bins: 80
+ audio_sample_rate: 22050
+ hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
+ win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
+ fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
+ fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
+ fmax: 7600 # To be increased/reduced depending on data.
+ f0_min: 80
+ f0_max: 800
+ griffin_lim_iters: 30
+ pitch_extractor: parselmouth
+ num_spk: 1
+ mel_vmin: -6
+ mel_vmax: 1.5
+ loud_norm: false
+
+ raw_data_dir: ''
+ processed_data_dir: ''
+ binary_data_dir: ''
+ preprocess_cls: ''
+ binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
+ preprocess_args:
+   nsample_per_mfa_group: 1000
+   # text process
+   txt_processor: en
+   use_mfa: true
+   with_phsep: true
+   reset_phone_dict: true
+   reset_word_dict: true
+   add_eos_bos: true
+   # mfa
+   mfa_group_shuffle: false
+   mfa_offset: 0.02
+   # wav processors
+   wav_processors: [ ]
+   save_sil_mask: true
+   vad_max_silence_length: 12
+ binarization_args:
+   shuffle: false
+   with_wav: false
+   with_align: true
+   with_spk_embed: false
+   with_f0: true
+   with_f0cwt: false
+   with_linear: false
+   trim_eos_bos: false
+   min_sil_duration: 0.1
+   train_range: [ 200, -1 ]
+   test_range: [ 0, 100 ]
+   valid_range: [ 100, 200 ]
+ word_dict_size: 10000
+ pitch_key: pitch
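Note: with audio_sample_rate 22050 and hop_size 256, one mel frame covers 256 / 22050 ≈ 11.6 ms, and win_size / fft_size 1024 ≈ 46.4 ms; the "275 ~= 12.5 ms" in the hop_size comment restates the 0.0125 * sample_rate rule of thumb rather than the value actually configured here.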
egs/egs_bases/tts/ds.yaml ADDED
@@ -0,0 +1,33 @@
+ base_config: ./fs2_orig.yaml
+
+ # special configs for diffspeech
+ task_cls: tasks.tts.diffspeech.DiffSpeechTask
+ lr: 0.001
+ timesteps: 100
+ K_step: 71
+ diff_loss_type: l1
+ diff_decoder_type: 'wavenet'
+ schedule_type: 'linear'
+ max_beta: 0.06
+
+ ## model configs for diffspeech
+ dilation_cycle_length: 1
+ residual_layers: 20
+ residual_channels: 256
+ decay_steps: 50000
+ keep_bins: 80
+ #content_cond_steps: [ ] # [ 0, 10000 ]
+ #spk_cond_steps: [ ] # [ 0, 10000 ]
+ #gen_tgt_spk_id: -1
+
+
+
+ # training configs for diffspeech
+ #max_sentences: 48
+ #num_sanity_val_steps: 1
+ num_valid_plots: 10
+ use_gt_dur: false
+ use_gt_f0: false
+ use_energy_embed: false
+ #pitch_type: cwt
+ max_updates: 160000
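Note: the combination of an fs2_ckpt, timesteps: 100, and K_step: 71 is DiffSpeech's shallow-diffusion setup: at inference the mel predicted by the pretrained auxiliary decoder is noised for K steps and only those K steps are then denoised, instead of sampling from pure Gaussian noise. The noised starting point follows the standard DDPM forward marginal, sketched here in the usual notation with alpha_bar_K the cumulative product of (1 - beta_t) under the linear schedule capped at max_beta:

    q(x_K | x_0) = N( sqrt(alpha_bar_K) * x_0, (1 - alpha_bar_K) * I )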
egs/egs_bases/tts/fs.yaml ADDED
@@ -0,0 +1,75 @@
+ base_config: ./base.yaml
+ task_cls: tasks.tts.fs.FastSpeechTask
+
+ # model
+ hidden_size: 256
+ dropout: 0.0
+ encoder_type: rel_fft # rel_fft|fft|tacotron|tacotron2|conformer
+ decoder_type: conv # fft|rnn|conv|conformer|wn
+
+ # rnn enc/dec
+ encoder_K: 8
+ decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2
+
+ # fft enc/dec
+ enc_layers: 4
+ enc_ffn_kernel_size: 9
+ enc_prenet: true
+ enc_pre_ln: true
+ dec_layers: 4
+ dec_ffn_kernel_size: 9
+ num_heads: 2
+ ffn_act: gelu
+ ffn_hidden_size: 1024
+ use_pos_embed: true
+
+ # conv enc/dec
+ enc_dec_norm: ln
+ conv_use_pos: false
+ layers_in_block: 2
+ enc_dilations: [ 1, 1, 1, 1 ]
+ enc_kernel_size: 5
+ enc_post_net_kernel: 3
+ dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder
+ dec_kernel_size: 5
+ dec_post_net_kernel: 3
+
+ # duration
+ predictor_hidden: -1
+ dur_predictor_kernel: 3
+ dur_predictor_layers: 2
+ predictor_kernel: 5
+ predictor_layers: 5
+ predictor_dropout: 0.5
+
+ # pitch and energy
+ use_pitch_embed: false
+ pitch_type: frame # frame|ph|cwt
+ use_uv: true
+
+ # reference encoder and speaker embedding
+ lambda_commit: 0.25
+ ref_norm_layer: bn
+ dec_inp_add_noise: false
+
+ # mel
+ mel_losses: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
+
+ # loss lambda
+ lambda_f0: 1.0
+ lambda_uv: 1.0
+ lambda_energy: 0.1
+ lambda_ph_dur: 0.1
+ lambda_sent_dur: 1.0
+ lambda_word_dur: 1.0
+ predictor_grad: 0.1
+
+ # train and eval
+ warmup_updates: 4000
+ max_tokens: 40000
+ max_sentences: 128
+ max_valid_sentences: 1
+ max_updates: 160000
+ use_gt_dur: false
+ use_gt_f0: false
+ ds_workers: 2
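Note: mel_losses encodes a weighted sum of mel reconstruction losses as name:weight pairs separated by '|'. A minimal parsing sketch of that convention (illustrative only; the actual loss wiring lives in the task code):

    def parse_mel_losses(spec: str) -> dict:
        # 'l1:0.5|ssim:0.5' -> {'l1': 0.5, 'ssim': 0.5}; a bare name such as 'l1' gets weight 1.0
        losses = {}
        for term in spec.split('|'):
            name, _, weight = term.partition(':')
            losses[name] = float(weight) if weight else 1.0
        return losses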
egs/egs_bases/tts/fs2_orig.yaml ADDED
@@ -0,0 +1,13 @@
+ base_config: ./fs.yaml
+ task_cls: tasks.tts.fs2_orig.FastSpeech2OrigTask
+ encoder_type: fft
+ decoder_type: fft
+ use_energy_embed: false
+ use_pitch_embed: true
+ pitch_type: cwt # frame|ph|cwt
+ binarization_args:
+   with_f0cwt: true
+ use_gt_energy: false
+ cwt_std_scale: 0.8
+ dropout: 0.1
+ mel_losses: l1
egs/egs_bases/tts/ps.yaml ADDED
@@ -0,0 +1,63 @@
+ base_config: ./fs.yaml
+
+ ###########################
+ # models
+ ###########################
+ # encoders
+ hidden_size: 192
+ ffn_hidden_size: 768
+ enc_ffn_kernel_size: 5
+ enc_layers: 4
+ dur_level: word
+ encoder_type: rel_fft
+ use_word_encoder: true
+
+ # mix ling encoder
+ word_enc_layers: 4
+ word_encoder_type: rel_fft
+ use_pitch_embed: false
+ enc_prenet: true
+ enc_pre_ln: true
+ text_encoder_postnet: true
+ dropout: 0.0
+ add_word_pos: true
+
+ # dur predictor
+ dur_predictor_layers: 3
+ dur_predictor_kernel: 5
+ predictor_dropout: 0.2
+
+ ## fvae
+ use_fvae: true
+ latent_size: 16
+ fvae_encoder_type: conv
+ fvae_decoder_type: conv
+ fvae_enc_dec_hidden: 192
+ fvae_kernel_size: 5
+ fvae_enc_n_layers: 8
+ fvae_dec_n_layers: 4
+ fvae_strides: 4
+ fvae_noise_scale: 1.0
+
+ # prior flow
+ use_prior_flow: true
+ prior_flow_hidden: 64
+ prior_flow_kernel_size: 3
+ prior_flow_n_blocks: 4
+
+ ###########################
+ # training and inference
+ ###########################
+ lambda_kl: 1.0
+ kl_min: 0.0
+ lambda_sent_dur: 0.0
+ kl_start_steps: 10000
+ posterior_start_steps: 0
+ frames_multiple: 4
+ num_valid_plots: 10
+ lr: 0.0002
+ warmup_updates: 8000
+ max_tokens: 40000
+ valid_infer_interval: 10000
+ max_sentences: 80
+ max_updates: 480000
egs/egs_bases/tts/ps_flow.yaml ADDED
@@ -0,0 +1,20 @@
+ base_config: ./ps.yaml
+ task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
+
+ use_post_flow: true
+ detach_postflow_input: true
+ post_flow_lr: 0.001
+ post_glow_hidden: 192
+ post_glow_kernel_size: 3
+ post_glow_n_blocks: 12
+ post_glow_n_block_layers: 3
+ post_share_cond_layers: false
+ share_wn_layers: 4
+ use_cond_proj: false
+ use_latent_cond: false
+ use_txt_cond: true
+ sigmoid_scale: false
+ post_glow_training_start: 160000
+ noise_scale: 0.8
+ infer_post_glow: true
+ two_stage: true
egs/egs_bases/tts/ps_flow_small.yaml ADDED
@@ -0,0 +1,42 @@
+ base_config: ./ps_flow.yaml
+
+ ###########################
+ # models
+ ###########################
+ # encoders
+ hidden_size: 128
+ ffn_hidden_size: 512
+ enc_ffn_kernel_size: 3
+ enc_layers: 3
+ word_enc_layers: 3
+
+ # dur predictor
+ dur_predictor_layers: 3
+ dur_predictor_kernel: 5
+ predictor_dropout: 0.2
+
+ ## fvae
+ use_fvae: true
+ latent_size: 16
+ fvae_encoder_type: wn
+ fvae_decoder_type: wn
+ fvae_enc_dec_hidden: 128
+ fvae_kernel_size: 3
+ fvae_enc_n_layers: 8
+ fvae_dec_n_layers: 3
+ fvae_strides: 4
+ fvae_noise_scale: 1.0
+
+
+ # prior flow
+ use_prior_flow: true
+ prior_flow_hidden: 32
+ prior_flow_kernel_size: 3
+ prior_flow_n_blocks: 3
+ # post flow
+ post_glow_hidden: 128
+ post_glow_kernel_size: 3
+ post_glow_n_blocks: 8
+ post_glow_n_block_layers: 3
+ share_wn_layers: 4
+ noise_scale: 0.6
egs/egs_bases/tts/synta.yaml ADDED
@@ -0,0 +1,20 @@
+ base_config: ./ps.yaml
+ task_cls: tasks.tts.synta.SyntaSpeechTask
+
+ use_post_flow: true
+ detach_postflow_input: true
+ post_flow_lr: 0.001
+ post_glow_hidden: 192
+ post_glow_kernel_size: 3
+ post_glow_n_blocks: 12
+ post_glow_n_block_layers: 3
+ post_share_cond_layers: false
+ share_wn_layers: 4
+ use_cond_proj: false
+ use_latent_cond: false
+ use_txt_cond: true
+ sigmoid_scale: false
+ post_glow_training_start: 160000
+ noise_scale: 0.8
+ infer_post_glow: true
+ two_stage: true
egs/egs_bases/tts/vocoder/base.yaml ADDED
@@ -0,0 +1,20 @@
+ base_config:
+   - egs/egs_bases/config_base.yaml
+   - ../dataset_params.yaml
+ binarization_args:
+   with_wav: true
+   with_spk_embed: false
+   with_align: false
+
+ generator_grad_norm: 10.0 # Generator's gradient norm.
+ discriminator_grad_norm: 1.0 # Discriminator's gradient norm.
+
+ ###########
+ # train and eval
+ ###########
+ max_samples: 20480
+ max_sentences: 8
+ max_valid_sentences: 1
+ max_updates: 2000000
+ val_check_interval: 5000
+ valid_infer_interval: 50000
egs/egs_bases/tts/vocoder/hifigan.yaml ADDED
@@ -0,0 +1,28 @@
+ base_config: ./base.yaml
+ task_cls: tasks.vocoder.hifigan.HifiGanTask
+ resblock: "1"
+ adam_b1: 0.8
+ adam_b2: 0.99
+ upsample_rates: [ 8,8,2,2 ]
+ upsample_kernel_sizes: [ 16,16,4,4 ]
+ upsample_initial_channel: 512
+ resblock_kernel_sizes: [ 3,7,11 ]
+ resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
+
+ use_pitch_embed: false
+ use_fm_loss: false
+ use_ms_stft: false
+
+ lambda_mel: 5.0
+ lambda_mel_adv: 1.0
+ lambda_cdisc: 4.0
+ lambda_adv: 1.0
+
+ lr: 0.0002 # Generator's learning rate.
+ generator_scheduler_params:
+   step_size: 600
+   gamma: 0.999
+ discriminator_scheduler_params:
+   step_size: 600
+   gamma: 0.999
+ max_updates: 3000000
inference/tts/__pycache__/base_tts_infer.cpython-36.pyc ADDED
Binary file (4.19 kB)
inference/tts/base_tts_infer.py ADDED
@@ -0,0 +1,120 @@
+ import os
+
+ import torch
+
+ from modules.vocoder.hifigan.hifigan import HifiGanGenerator
+ from tasks.tts.dataset_utils import FastSpeechWordDataset
+ from tasks.tts.tts_utils import load_data_preprocessor
+ from utils.commons.ckpt_utils import load_ckpt
+ from utils.commons.hparams import set_hparams
+
+
+ class BaseTTSInfer:
+     def __init__(self, hparams, device=None):
+         if device is None:
+             device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         self.hparams = hparams
+         self.device = device
+         self.data_dir = hparams['binary_data_dir']
+         self.preprocessor, self.preprocess_args = load_data_preprocessor()
+         self.ph_encoder, self.word_encoder = self.preprocessor.load_dict(self.data_dir)
+         self.spk_map = self.preprocessor.load_spk_map(self.data_dir)
+         self.ds_cls = FastSpeechWordDataset
+         self.model = self.build_model()
+         self.model.eval()
+         self.model.to(self.device)
+         self.vocoder = self.build_vocoder()
+         self.vocoder.eval()
+         self.vocoder.to(self.device)
+
+     def build_model(self):
+         raise NotImplementedError
+
+     def forward_model(self, inp):
+         raise NotImplementedError
+
+     def build_vocoder(self):
+         base_dir = self.hparams['vocoder_ckpt']
+         config_path = f'{base_dir}/config.yaml'
+         config = set_hparams(config_path, global_hparams=False)
+         vocoder = HifiGanGenerator(config)
+         load_ckpt(vocoder, base_dir, 'model_gen')
+         return vocoder
+
+     def run_vocoder(self, c):
+         c = c.transpose(2, 1)
+         y = self.vocoder(c)[:, 0]
+         return y
+
+     def preprocess_input(self, inp):
+         """
+
+         :param inp: {'text': str, 'item_name': (str, optional), 'spk_name': (str, optional)}
+         :return:
+         """
+         preprocessor, preprocess_args = self.preprocessor, self.preprocess_args
+         text_raw = inp['text']
+         item_name = inp.get('item_name', '<ITEM_NAME>')
+         spk_name = inp.get('spk_name', '<SINGLE_SPK>')
+         ph, txt, word, ph2word, ph_gb_word = preprocessor.txt_to_ph(
+             preprocessor.txt_processor, text_raw, preprocess_args)
+         word_token = self.word_encoder.encode(word)
+         ph_token = self.ph_encoder.encode(ph)
+         spk_id = self.spk_map[spk_name]
+         item = {'item_name': item_name, 'text': txt, 'ph': ph, 'spk_id': spk_id,
+                 'ph_token': ph_token, 'word_token': word_token, 'ph2word': ph2word,
+                 'ph_words': ph_gb_word, 'words': word}
+         item['ph_len'] = len(item['ph_token'])
+         return item
+
+     def input_to_batch(self, item):
+         item_names = [item['item_name']]
+         text = [item['text']]
+         ph = [item['ph']]
+         txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
+         txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+         word_tokens = torch.LongTensor(item['word_token'])[None, :].to(self.device)
+         word_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+         ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device)
+         spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)
+         batch = {
+             'item_name': item_names,
+             'text': text,
+             'ph': ph,
+             'txt_tokens': txt_tokens,
+             'txt_lengths': txt_lengths,
+             'word_tokens': word_tokens,
+             'word_lengths': word_lengths,
+             'ph2word': ph2word,
+             'spk_ids': spk_ids,
+         }
+         return batch
+
+     def postprocess_output(self, output):
+         return output
+
+     def infer_once(self, inp):
+         inp = self.preprocess_input(inp)
+         output = self.forward_model(inp)
+         output = self.postprocess_output(output)
+         return output
+
+     @classmethod
+     def example_run(cls):
+         from utils.commons.hparams import set_hparams
+         from utils.commons.hparams import hparams as hp
+         from utils.audio.io import save_wav
+
+         set_hparams()
+         if hp['ds_name'] in ['lj', 'libritts']:
+             inp = {
+                 'text': 'the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.'
+             }
+         elif hp['ds_name'] in ['biaobei']:
+             inp = {
+                 'text': '如果我想你三遍,天上乌云就散一片。'
+             }
+         infer_ins = cls(hp)
+         out = infer_ins.infer_once(inp)
+         os.makedirs('infer_out', exist_ok=True)
+         save_wav(out, f'infer_out/example_out.wav', hp['audio_sample_rate'])
inference/tts/ds.py ADDED
@@ -0,0 +1,30 @@
+ import torch
+ # from inference.tts.fs import FastSpeechInfer
+ # from modules.tts.fs2_orig import FastSpeech2Orig
+ from inference.tts.base_tts_infer import BaseTTSInfer
+ from modules.tts.diffspeech.shallow_diffusion_tts import GaussianDiffusion
+ from utils.commons.ckpt_utils import load_ckpt
+ from utils.commons.hparams import hparams
+
+
+ class DiffSpeechInfer(BaseTTSInfer):
+     def build_model(self):
+         dict_size = len(self.ph_encoder)
+         model = GaussianDiffusion(dict_size, self.hparams)
+         model.eval()
+         load_ckpt(model, hparams['work_dir'], 'model')
+         return model
+
+     def forward_model(self, inp):
+         sample = self.input_to_batch(inp)
+         txt_tokens = sample['txt_tokens']  # [B, T_t]
+         spk_id = sample.get('spk_ids')
+         with torch.no_grad():
+             output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True)
+             mel_out = output['mel_out']
+             wav_out = self.run_vocoder(mel_out)
+         wav_out = wav_out.cpu().numpy()
+         return wav_out[0]
+
+ if __name__ == '__main__':
+     DiffSpeechInfer.example_run()
inference/tts/fs.py ADDED
@@ -0,0 +1,29 @@
+ import torch
+ from inference.tts.base_tts_infer import BaseTTSInfer
+ from modules.tts.fs import FastSpeech
+ from utils.commons.ckpt_utils import load_ckpt
+ from utils.commons.hparams import hparams
+
+
+ class FastSpeechInfer(BaseTTSInfer):
+     def build_model(self):
+         dict_size = len(self.ph_encoder)
+         model = FastSpeech(dict_size, self.hparams)
+         model.eval()
+         load_ckpt(model, hparams['work_dir'], 'model')
+         return model
+
+     def forward_model(self, inp):
+         sample = self.input_to_batch(inp)
+         txt_tokens = sample['txt_tokens']  # [B, T_t]
+         spk_id = sample.get('spk_ids')
+         with torch.no_grad():
+             output = self.model(txt_tokens, spk_id=spk_id, infer=True)
+             mel_out = output['mel_out']
+             wav_out = self.run_vocoder(mel_out)
+         wav_out = wav_out.cpu().numpy()
+         return wav_out[0]
+
+
+ if __name__ == '__main__':
+     FastSpeechInfer.example_run()
inference/tts/fs2_orig.py ADDED
@@ -0,0 +1,17 @@
+ from inference.tts.fs import FastSpeechInfer
+ from modules.tts.fs2_orig import FastSpeech2Orig
+ from utils.commons.ckpt_utils import load_ckpt
+ from utils.commons.hparams import hparams
+
+
+ class FastSpeech2OrigInfer(FastSpeechInfer):
+     def build_model(self):
+         dict_size = len(self.ph_encoder)
+         model = FastSpeech2Orig(dict_size, self.hparams)
+         model.eval()
+         load_ckpt(model, hparams['work_dir'], 'model')
+         return model
+
+
+ if __name__ == '__main__':
+     FastSpeech2OrigInfer.example_run()
inference/tts/gradio/gradio_settings.yaml ADDED
@@ -0,0 +1,12 @@
+ title: 'yerfor/SyntaSpeech'
+ description: |
+   Gradio demo for yerfor/SyntaSpeech. To use it, simply add your audio, or click one of the examples to load them. Note: This space is running on CPU, inference times will be higher.
+ article: |
+   Link to <a href='https://github.com/yerfor/SyntaSpeech' style='color:blue;' target='_blank\'>Github REPO</a>
+ example_inputs:
+   - |-
+     the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
+   - |-
+     produced the block books, which were the immediate predecessors of the true printed book,
+ inference_cls: inference.tts.synta.SyntaSpeechInfer
+ exp_name: lj_synta
inference/tts/gradio/infer.py ADDED
@@ -0,0 +1,69 @@
+ import importlib
+ import re
+
+ import gradio as gr
+ import yaml
+ from gradio.inputs import Textbox
+
+ from inference.tts.base_tts_infer import BaseTTSInfer
+ from utils.commons.hparams import set_hparams
+ from utils.commons.hparams import hparams as hp
+ import numpy as np
+
+ from utils.text.text_encoder import PUNCS
+
+
+ class GradioInfer:
+     def __init__(self, exp_name, inference_cls, title, description, article, example_inputs):
+         self.exp_name = exp_name
+         self.title = title
+         self.description = description
+         self.article = article
+         self.example_inputs = example_inputs
+         pkg = ".".join(inference_cls.split(".")[:-1])
+         cls_name = inference_cls.split(".")[-1]
+         self.inference_cls = getattr(importlib.import_module(pkg), cls_name)
+
+     def greet(self, text):
+         sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
+         if sents[-1] not in list(PUNCS):
+             sents = sents + ['.']
+         audio_outs = []
+         s = ""
+         for i in range(0, len(sents), 2):
+             if len(sents[i]) > 0:
+                 s += sents[i] + sents[i + 1]
+             if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
+                 audio_out = self.infer_ins.infer_once({
+                     'text': s
+                 })
+                 audio_out = audio_out * 32767
+                 audio_out = audio_out.astype(np.int16)
+                 audio_outs.append(audio_out)
+                 audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
+                 s = ""
+         audio_outs = np.concatenate(audio_outs)
+         return hp['audio_sample_rate'], audio_outs
+
+     def run(self):
+         set_hparams(exp_name=self.exp_name)
+         infer_cls = self.inference_cls
+         self.infer_ins: BaseTTSInfer = infer_cls(hp)
+         example_inputs = self.example_inputs
+         iface = gr.Interface(fn=self.greet,
+                              inputs=Textbox(
+                                  lines=10, placeholder=None, default=example_inputs[0], label="input text"),
+                              outputs="audio",
+                              allow_flagging="never",
+                              title=self.title,
+                              description=self.description,
+                              article=self.article,
+                              examples=example_inputs,
+                              enable_queue=True)
+         iface.launch(share=True,cache_examples=True)
+
+
+ if __name__ == '__main__':
+     gradio_config = yaml.safe_load(open('inference/tts/gradio/gradio_settings.yaml'))
+     g = GradioInfer(**gradio_config)
+     g.run()
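Note: this entry point reads gradio_settings.yaml, imports the inference_cls named there (SyntaSpeechInfer with exp_name lj_synta above), and serves greet() as a text-to-audio Gradio interface; input text is synthesized in chunks of up to about 400 characters with 0.3 s of silence appended after each chunk. Assuming the lj_synta experiment directory with its config.yaml and checkpoint exists under checkpoints/, the demo would be launched from the repository root with something like:

    python inference/tts/gradio/infer.py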
inference/tts/ps_flow.py ADDED
@@ -0,0 +1,39 @@
+ import torch
+ from inference.tts.base_tts_infer import BaseTTSInfer
+ from modules.tts.portaspeech.portaspeech_flow import PortaSpeechFlow
+ from utils.commons.ckpt_utils import load_ckpt
+ from utils.commons.hparams import hparams
+
+
+ class PortaSpeechFlowInfer(BaseTTSInfer):
+     def build_model(self):
+         ph_dict_size = len(self.ph_encoder)
+         word_dict_size = len(self.word_encoder)
+         model = PortaSpeechFlow(ph_dict_size, word_dict_size, self.hparams)
+         load_ckpt(model, hparams['work_dir'], 'model')
+         model.to(self.device)
+         with torch.no_grad():
+             model.store_inverse_all()
+         model.eval()
+         return model
+
+     def forward_model(self, inp):
+         sample = self.input_to_batch(inp)
+         with torch.no_grad():
+             output = self.model(
+                 sample['txt_tokens'],
+                 sample['word_tokens'],
+                 ph2word=sample['ph2word'],
+                 word_len=sample['word_lengths'].max(),
+                 infer=True,
+                 forward_post_glow=True,
+                 spk_id=sample.get('spk_ids')
+             )
+             mel_out = output['mel_out']
+             wav_out = self.run_vocoder(mel_out)
+         wav_out = wav_out.cpu().numpy()
+         return wav_out[0]
+
+
+ if __name__ == '__main__':
+     PortaSpeechFlowInfer.example_run()
inference/tts/synta.py ADDED
@@ -0,0 +1,76 @@
+ import torch
+ from inference.tts.base_tts_infer import BaseTTSInfer
+ from modules.tts.syntaspeech.syntaspeech import SyntaSpeech
+ from utils.commons.ckpt_utils import load_ckpt
+ from utils.commons.hparams import hparams
+
+ from modules.tts.syntaspeech.syntactic_graph_buider import Sentence2GraphParser
+
+ class SyntaSpeechInfer(BaseTTSInfer):
+     def __init__(self, hparams, device=None):
+         super().__init__(hparams, device)
+         if hparams['ds_name'] in ['biaobei']:
+             self.syntactic_graph_builder = Sentence2GraphParser(language='zh')
+         elif hparams['ds_name'] in ['ljspeech', 'libritts']:
+             self.syntactic_graph_builder = Sentence2GraphParser(language='en')
+
+     def build_model(self):
+         ph_dict_size = len(self.ph_encoder)
+         word_dict_size = len(self.word_encoder)
+         model = SyntaSpeech(ph_dict_size, word_dict_size, self.hparams)
+         load_ckpt(model, hparams['work_dir'], 'model')
+         model.to(self.device)
+         with torch.no_grad():
+             model.store_inverse_all()
+         model.eval()
+         return model
+
+     def input_to_batch(self, item):
+         item_names = [item['item_name']]
+         text = [item['text']]
+         ph = [item['ph']]
+         txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
+         txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+         word_tokens = torch.LongTensor(item['word_token'])[None, :].to(self.device)
+         word_lengths = torch.LongTensor([word_tokens.shape[1]]).to(self.device)
+         ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device)
+         spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)
+         dgl_graph, etypes = self.syntactic_graph_builder.parse(item['text'], words=item['words'].split(" "), ph_words=item['ph_words'].split(" "))
+         dgl_graph = dgl_graph.to(self.device)
+         etypes = etypes.to(self.device)
+         batch = {
+             'item_name': item_names,
+             'text': text,
+             'ph': ph,
+             'txt_tokens': txt_tokens,
+             'txt_lengths': txt_lengths,
+             'word_tokens': word_tokens,
+             'word_lengths': word_lengths,
+             'ph2word': ph2word,
+             'spk_ids': spk_ids,
+             'graph_lst': [dgl_graph],
+             'etypes_lst': [etypes]
+         }
+         return batch
+     def forward_model(self, inp):
+         sample = self.input_to_batch(inp)
+         with torch.no_grad():
+             output = self.model(
+                 sample['txt_tokens'],
+                 sample['word_tokens'],
+                 ph2word=sample['ph2word'],
+                 word_len=sample['word_lengths'].max(),
+                 infer=True,
+                 forward_post_glow=True,
+                 spk_id=sample.get('spk_ids'),
+                 graph_lst=sample['graph_lst'],
+                 etypes_lst=sample['etypes_lst']
+             )
+             mel_out = output['mel_out']
+             wav_out = self.run_vocoder(mel_out)
+         wav_out = wav_out.cpu().numpy()
+         return wav_out[0]
+
+
+ if __name__ == '__main__':
+     SyntaSpeechInfer.example_run()
modules/commons/__pycache__/conv.cpython-36.pyc ADDED
Binary file (6.54 kB)