Spaces · Build error
Commit: init
This view is limited to 50 files because it contains too many changes. See raw diff.
- README.md +4 -6
- data/binary/ljspeech/phone_set.json +1 -0
- data/binary/ljspeech/spk_map.json +1 -0
- data/binary/ljspeech/word_set.json +0 -0
- egs/datasets/audio/biaobei/__pycache__/preprocess.cpython-36.pyc +0 -0
- egs/datasets/audio/biaobei/base_text2mel.yaml +18 -0
- egs/datasets/audio/biaobei/preprocess.py +16 -0
- egs/datasets/audio/biaobei/ps_flow.yaml +3 -0
- egs/datasets/audio/biaobei/synta.yaml +19 -0
- egs/datasets/audio/libritts/__pycache__/preprocess.cpython-36.pyc +0 -0
- egs/datasets/audio/libritts/base_text2mel.yaml +22 -0
- egs/datasets/audio/libritts/preprocess.py +13 -0
- egs/datasets/audio/libritts/ps_flow.yaml +3 -0
- egs/datasets/audio/libritts/synta.yaml +19 -0
- egs/datasets/audio/lj/__pycache__/preprocess.cpython-36.pyc +0 -0
- egs/datasets/audio/lj/base_mel2wav.yaml +4 -0
- egs/datasets/audio/lj/base_text2mel.yaml +17 -0
- egs/datasets/audio/lj/ds.yaml +29 -0
- egs/datasets/audio/lj/fs.yaml +3 -0
- egs/datasets/audio/lj/fs2_orig.yaml +4 -0
- egs/datasets/audio/lj/hifigan.yaml +3 -0
- egs/datasets/audio/lj/preprocess.py +9 -0
- egs/datasets/audio/lj/ps_flow.yaml +3 -0
- egs/datasets/audio/lj/ps_flow_nips2021.yaml +11 -0
- egs/datasets/audio/lj/ps_flow_small.yaml +3 -0
- egs/datasets/audio/lj/ps_flow_small_nips2021.yaml +11 -0
- egs/datasets/audio/lj/synta.yaml +19 -0
- egs/egs_bases/config_base.yaml +41 -0
- egs/egs_bases/tts/base.yaml +56 -0
- egs/egs_bases/tts/base_zh.yaml +5 -0
- egs/egs_bases/tts/dataset_params.yaml +52 -0
- egs/egs_bases/tts/ds.yaml +33 -0
- egs/egs_bases/tts/fs.yaml +75 -0
- egs/egs_bases/tts/fs2_orig.yaml +13 -0
- egs/egs_bases/tts/ps.yaml +63 -0
- egs/egs_bases/tts/ps_flow.yaml +20 -0
- egs/egs_bases/tts/ps_flow_small.yaml +42 -0
- egs/egs_bases/tts/synta.yaml +20 -0
- egs/egs_bases/tts/vocoder/base.yaml +20 -0
- egs/egs_bases/tts/vocoder/hifigan.yaml +28 -0
- inference/tts/__pycache__/base_tts_infer.cpython-36.pyc +0 -0
- inference/tts/base_tts_infer.py +120 -0
- inference/tts/ds.py +30 -0
- inference/tts/fs.py +29 -0
- inference/tts/fs2_orig.py +17 -0
- inference/tts/gradio/gradio_settings.yaml +12 -0
- inference/tts/gradio/infer.py +69 -0
- inference/tts/ps_flow.py +39 -0
- inference/tts/synta.py +76 -0
- modules/commons/__pycache__/conv.cpython-36.pyc +0 -0
README.md
CHANGED
@@ -1,12 +1,10 @@
 ---
 title: SyntaSpeech
-emoji:
-colorFrom:
-colorTo:
+emoji: 🤗
+colorFrom: yellow
+colorTo: orange
 sdk: gradio
-
-app_file: app.py
+app_file: "inference/tts/gradio/infer.py"
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
data/binary/ljspeech/phone_set.json
ADDED
@@ -0,0 +1 @@
+["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"]
data/binary/ljspeech/spk_map.json
ADDED
@@ -0,0 +1 @@
+{"<SINGLE_SPK>": 0}
data/binary/ljspeech/word_set.json
ADDED
The diff for this file is too large to render. See raw diff.
egs/datasets/audio/biaobei/__pycache__/preprocess.cpython-36.pyc
ADDED
Binary file (1.12 kB)
egs/datasets/audio/biaobei/base_text2mel.yaml
ADDED
@@ -0,0 +1,18 @@
+base_config: egs/egs_bases/tts/base_zh.yaml
+raw_data_dir: 'data/raw/biaobei'
+processed_data_dir: 'data/processed/biaobei'
+binary_data_dir: 'data/binary/biaobei'
+preprocess_cls: egs.datasets.audio.biaobei.preprocess.BiaobeiPreprocess
+
+ds_name: biaobei
+binarization_args:
+  train_range: [ 871, -1 ]
+  test_range: [ 0, 523 ]
+  valid_range: [ 523, 871 ]
+test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+f0_min: 80
+f0_max: 600
+vocoder_ckpt: checkpoints/hifi_biaobei
egs/datasets/audio/biaobei/preprocess.py
ADDED
@@ -0,0 +1,16 @@
+from data_gen.tts.base_preprocess import BasePreprocessor
+import re
+
+
+class BiaobeiPreprocess(BasePreprocessor):
+    def meta_data(self):
+        input_dir = self.raw_data_dir
+        with open(f"{input_dir}/ProsodyLabeling/000001-010000.txt", encoding='utf-8') as f:
+            bb_lines = f.readlines()[::2]
+        for l_idx, l in (enumerate([re.sub("\#\d+", "", l.split('\t')[1].strip()) for l in bb_lines])):
+            item_name = f'{l_idx + 1:06d}'
+            wav_fn = f"{input_dir}/wav/{l_idx + 1:06d}.wav"
+            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': l}
+
+if __name__ == "__main__":
+    BiaobeiPreprocess().process()
egs/datasets/audio/biaobei/ps_flow.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow.yaml
+  - ./base_text2mel.yaml
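Note: most configs in this commit are layered through base_config chains like the one above. A minimal sketch of how such chains could be merged is shown below; the load_config helper and its merge order are illustrative assumptions, not the repository's actual set_hparams implementation, which may resolve paths and precedence differently.

    import os
    import yaml

    def load_config(path, root='.'):
        """Recursively merge a YAML config with the files listed under base_config.

        Later base files, and finally the child config itself, override earlier keys.
        """
        with open(os.path.join(root, path)) as f:
            cfg = yaml.safe_load(f) or {}
        bases = cfg.pop('base_config', [])
        if isinstance(bases, str):
            bases = [bases]
        merged = {}
        for base in bases:
            # relative entries such as './base_text2mel.yaml' are resolved next to the child config
            base_path = os.path.join(os.path.dirname(path), base) if base.startswith('.') else base
            merged.update(load_config(base_path, root))
        merged.update(cfg)
        return merged

    # e.g. load_config('egs/datasets/audio/biaobei/ps_flow.yaml')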
egs/datasets/audio/biaobei/synta.yaml
ADDED
@@ -0,0 +1,19 @@
+base_config:
+  - egs/egs_bases/tts/synta.yaml
+  - ./base_text2mel.yaml
+
+lambda_mel_adv: 0.05
+
+disc_win_num: 3
+mel_disc_hidden_size: 128
+disc_norm: in
+disc_reduction: stack
+disc_interval: 1
+disc_lr: 0.0001
+disc_start_steps: 0
+discriminator_scheduler_params:
+  gamma: 0.5
+  step_size: 40000
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  weight_decay: 0.0
egs/datasets/audio/libritts/__pycache__/preprocess.cpython-36.pyc
ADDED
Binary file (893 Bytes)
egs/datasets/audio/libritts/base_text2mel.yaml
ADDED
@@ -0,0 +1,22 @@
+ds_name: libritts
+base_config: egs/egs_bases/tts/base.yaml
+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/libritts'
+binary_data_dir: 'data/binary/libritts'
+preprocess_cls: egs.datasets.audio.libritts.preprocess.LibriTTSPreprocess
+binarization_args:
+  train_range: [ 871, -1 ]
+  test_range: [ 0, 523 ]
+  valid_range: [ 523, 871 ]
+  shuffle: false
+  with_spk_id: true
+  with_spk_embed: false
+test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+f0_min: 80
+f0_max: 600
+vocoder: PWG
+vocoder_ckpt: checkpoints/pwg_libritts
+num_spk: 2000
egs/datasets/audio/libritts/preprocess.py
ADDED
@@ -0,0 +1,13 @@
+from data_gen.tts.base_preprocess import BasePreprocessor
+import glob, os
+
+class LibriTTSPreprocess(BasePreprocessor):
+    def meta_data(self):
+        wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*/*.wav'))
+        for wav_fn in wav_fns:
+            item_name = os.path.basename(wav_fn)[:-4]
+            txt_fn = f'{wav_fn[:-4]}.normalized.txt'
+            with open(txt_fn, 'r') as f:
+                txt = f.read()
+            spk_name = item_name.split("_")[0]
+            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt, 'spk_name': spk_name}
egs/datasets/audio/libritts/ps_flow.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow.yaml
+  - ./base_text2mel.yaml
egs/datasets/audio/libritts/synta.yaml
ADDED
@@ -0,0 +1,19 @@
+base_config:
+  - egs/egs_bases/tts/synta.yaml
+  - ./base_text2mel.yaml
+
+lambda_mel_adv: 0.05
+
+disc_win_num: 3
+mel_disc_hidden_size: 128
+disc_norm: in
+disc_reduction: stack
+disc_interval: 1
+disc_lr: 0.0001
+disc_start_steps: 0
+discriminator_scheduler_params:
+  gamma: 0.5
+  step_size: 40000
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  weight_decay: 0.0
egs/datasets/audio/lj/__pycache__/preprocess.cpython-36.pyc
ADDED
Binary file (711 Bytes)
egs/datasets/audio/lj/base_mel2wav.yaml
ADDED
@@ -0,0 +1,4 @@
+base_config: egs/egs_bases/tts/vocoder/base.yaml
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech_wav'
egs/datasets/audio/lj/base_text2mel.yaml
ADDED
@@ -0,0 +1,17 @@
+ds_name: ljspeech
+base_config: egs/egs_bases/tts/base.yaml
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech'
+preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+binarization_args:
+  train_range: [ 871, -1 ]
+  test_range: [ 0, 523 ]
+  valid_range: [ 523, 871 ]
+test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+f0_min: 80
+f0_max: 600
+vocoder_ckpt: checkpoints/hifi_lj
egs/datasets/audio/lj/ds.yaml
ADDED
@@ -0,0 +1,29 @@
+base_config:
+  - egs/egs_bases/tts/ds.yaml
+  - ./fs2_orig.yaml
+
+fs2_ckpt: checkpoints/aux_exp/model_ckpt_steps_100000.ckpt
+
+# spec_min and spec_max are calculated on the training set.
+spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672,
+            -4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759,
+            -4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733,
+            -5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510,
+            -5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916,
+            -4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875,
+            -5.0483, -5.0848, -5.0655, -5.0279, -5.0015, -5.0792, -5.0636, -5.2413,
+            -5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173,
+            -5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757,
+            -5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ]
+spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5830, 0.7093,
+            0.6461, 0.6101, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591,
+            0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492,
+            0.6909, 0.6106, 0.5761, 0.5236, 0.5638, 0.4054, 0.4545, 0.3407,
+            0.3037, 0.3380, 0.1599, 0.1603, 0.2741, 0.2130, 0.1569, 0.1911,
+            0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933,
+            -0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405,
+            -0.1244, -0.2582, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000,
+            0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.1389, 0.1382,
+            0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2280, 0.2932, 0.3047 ]
+
+max_tokens: 30000
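Note: spec_min/spec_max above are per-mel-bin statistics of the training set. DiffSpeech-style diffusion decoders typically use such statistics to squash the log-mel spectrogram into [-1, 1] before diffusion and to undo it afterwards; the sketch below assumes that convention, and norm_spec/denorm_spec are illustrative names rather than the repository's own functions.

    import numpy as np

    def norm_spec(mel, spec_min, spec_max):
        # mel: [T, 80] log-mel; map each bin linearly into [-1, 1]
        spec_min, spec_max = np.asarray(spec_min), np.asarray(spec_max)
        return (mel - spec_min) / (spec_max - spec_min) * 2 - 1

    def denorm_spec(mel_norm, spec_min, spec_max):
        spec_min, spec_max = np.asarray(spec_min), np.asarray(spec_max)
        return (mel_norm + 1) / 2 * (spec_max - spec_min) + spec_min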
egs/datasets/audio/lj/fs.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/fs.yaml
+  - ./base_text2mel.yaml
egs/datasets/audio/lj/fs2_orig.yaml
ADDED
@@ -0,0 +1,4 @@
+base_config:
+  - egs/egs_bases/tts/fs2_orig.yaml
+  - ./base_text2mel.yaml
+binary_data_dir: 'data/binary/ljspeech_cwt'
egs/datasets/audio/lj/hifigan.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/vocoder/hifigan.yaml
+  - ./base_mel2wav.yaml
egs/datasets/audio/lj/preprocess.py
ADDED
@@ -0,0 +1,9 @@
+from data_gen.tts.base_preprocess import BasePreprocessor
+
+
+class LJPreprocess(BasePreprocessor):
+    def meta_data(self):
+        for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
+            item_name, _, txt = l.strip().split("|")
+            wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
+            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}
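Note: LJPreprocess assumes the standard LJSpeech metadata.csv layout of three pipe-separated fields (item id | raw text | normalized text) and keeps only the last field. A hypothetical line (the text here is made up, not an actual LJSpeech transcript) and how it is parsed:

    # hypothetical metadata.csv line: "<id>|<raw text>|<normalized text>"
    line = "LJ001-0001|Printing, in the only sense.|Printing, in the only sense."
    item_name, _, txt = line.strip().split("|")
    # yields {'item_name': 'LJ001-0001',
    #         'wav_fn': 'data/raw/LJSpeech-1.1/wavs/LJ001-0001.wav',
    #         'txt': 'Printing, in the only sense.'}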
egs/datasets/audio/lj/ps_flow.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow.yaml
+  - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_nips2021.yaml
ADDED
@@ -0,0 +1,11 @@
+base_config:
+  - ./ps_flow.yaml
+max_sentences: 64
+dur_level: word
+use_word_encoder: false
+enc_prenet: true
+enc_pre_ln: false
+fvae_encoder_type: wn
+fvae_decoder_type: wn
+text_encoder_postnet: false
+warmup_updates: 8000
egs/datasets/audio/lj/ps_flow_small.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow_small.yaml
+  - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_small_nips2021.yaml
ADDED
@@ -0,0 +1,11 @@
+base_config:
+  - ./ps_flow_small.yaml
+max_sentences: 128
+dur_level: word
+use_word_encoder: false
+enc_prenet: true
+enc_pre_ln: false
+fvae_encoder_type: wn
+fvae_decoder_type: wn
+text_encoder_postnet: false
+warmup_updates: 8000
egs/datasets/audio/lj/synta.yaml
ADDED
@@ -0,0 +1,19 @@
+base_config:
+  - egs/egs_bases/tts/synta.yaml
+  - ./base_text2mel.yaml
+
+lambda_mel_adv: 0.05
+
+disc_win_num: 3
+mel_disc_hidden_size: 128
+disc_norm: in
+disc_reduction: stack
+disc_interval: 1
+disc_lr: 0.0001
+disc_start_steps: 0
+discriminator_scheduler_params:
+  gamma: 0.5
+  step_size: 40000
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  weight_decay: 0.0
egs/egs_bases/config_base.yaml
ADDED
@@ -0,0 +1,41 @@
+# task
+binary_data_dir: ''
+work_dir: '' # experiment directory.
+infer: false # infer
+amp: false
+seed: 1234
+debug: false
+save_codes: ['tasks', 'modules', 'egs']
+
+#############
+# dataset
+#############
+ds_workers: 1
+test_num: 100
+endless_ds: true
+sort_by_len: true
+
+#########
+# train and eval
+#########
+print_nan_grads: false
+load_ckpt: ''
+save_best: false
+num_ckpt_keep: 3
+clip_grad_norm: 0
+accumulate_grad_batches: 1
+tb_log_interval: 100
+num_sanity_val_steps: 5 # steps of validation at the beginning
+check_val_every_n_epoch: 10
+val_check_interval: 2000
+valid_monitor_key: 'val_loss'
+valid_monitor_mode: 'min'
+max_epochs: 1000
+max_updates: 1000000
+max_tokens: 40000
+max_sentences: 100000
+max_valid_tokens: -1
+max_valid_sentences: -1
+eval_max_batches: -1
+resume_from_checkpoint: 0
+rename_tmux: true
egs/egs_bases/tts/base.yaml
ADDED
@@ -0,0 +1,56 @@
+# task
+base_config:
+  - ../config_base.yaml
+  - ./dataset_params.yaml
+
+#############
+# dataset in training
+#############
+endless_ds: true
+min_frames: 0
+max_frames: 1548
+frames_multiple: 1
+max_input_tokens: 1550
+ds_workers: 1
+
+#########
+# model
+#########
+use_spk_id: false
+use_spk_embed: false
+mel_losses: "ssim:0.5|l1:0.5"
+
+###########
+# optimization
+###########
+lr: 0.0005
+scheduler: warmup # rsqrt|warmup|none
+warmup_updates: 4000
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+weight_decay: 0
+clip_grad_norm: 1
+clip_grad_value: 0
+
+
+###########
+# train and eval
+###########
+use_word_input: false
+max_valid_sentences: 1
+max_valid_tokens: 60000
+valid_infer_interval: 10000
+train_set_name: 'train'
+train_sets: ''
+valid_set_name: 'valid'
+test_set_name: 'test'
+num_valid_plots: 10
+test_ids: [ ]
+test_input_yaml: ''
+vocoder: HifiGAN
+vocoder_ckpt: ''
+profile_infer: false
+out_wav_norm: false
+save_gt: true
+save_f0: false
+gen_dir_name: ''
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config: ./base.yaml
|
2 |
+
preprocess_args:
|
3 |
+
txt_processor: zh
|
4 |
+
|
5 |
+
word_size: 3000
|
egs/egs_bases/tts/dataset_params.yaml
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio_num_mel_bins: 80
|
2 |
+
audio_sample_rate: 22050
|
3 |
+
hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
|
4 |
+
win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
|
5 |
+
fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
|
6 |
+
fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
|
7 |
+
fmax: 7600 # To be increased/reduced depending on data.
|
8 |
+
f0_min: 80
|
9 |
+
f0_max: 800
|
10 |
+
griffin_lim_iters: 30
|
11 |
+
pitch_extractor: parselmouth
|
12 |
+
num_spk: 1
|
13 |
+
mel_vmin: -6
|
14 |
+
mel_vmax: 1.5
|
15 |
+
loud_norm: false
|
16 |
+
|
17 |
+
raw_data_dir: ''
|
18 |
+
processed_data_dir: ''
|
19 |
+
binary_data_dir: ''
|
20 |
+
preprocess_cls: ''
|
21 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
22 |
+
preprocess_args:
|
23 |
+
nsample_per_mfa_group: 1000
|
24 |
+
# text process
|
25 |
+
txt_processor: en
|
26 |
+
use_mfa: true
|
27 |
+
with_phsep: true
|
28 |
+
reset_phone_dict: true
|
29 |
+
reset_word_dict: true
|
30 |
+
add_eos_bos: true
|
31 |
+
# mfa
|
32 |
+
mfa_group_shuffle: false
|
33 |
+
mfa_offset: 0.02
|
34 |
+
# wav processors
|
35 |
+
wav_processors: [ ]
|
36 |
+
save_sil_mask: true
|
37 |
+
vad_max_silence_length: 12
|
38 |
+
binarization_args:
|
39 |
+
shuffle: false
|
40 |
+
with_wav: false
|
41 |
+
with_align: true
|
42 |
+
with_spk_embed: false
|
43 |
+
with_f0: true
|
44 |
+
with_f0cwt: false
|
45 |
+
with_linear: false
|
46 |
+
trim_eos_bos: false
|
47 |
+
min_sil_duration: 0.1
|
48 |
+
train_range: [ 200, -1 ]
|
49 |
+
test_range: [ 0, 100 ]
|
50 |
+
valid_range: [ 100, 200 ]
|
51 |
+
word_dict_size: 10000
|
52 |
+
pitch_key: pitch
|
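Note: the hop_size/win_size comments above still quote the 275/1100-sample figures from a different setup; with the values actually set here (22050 Hz, hop 256, win 1024) the frame shift is roughly 11.6 ms rather than 12.5 ms. A quick check of the frame math:

    sr, hop_size, win_size = 22050, 256, 1024
    print(hop_size / sr * 1000)   # ~11.6 ms frame shift
    print(win_size / sr * 1000)   # ~46.4 ms analysis window
    print(sr / hop_size)          # ~86 mel frames per second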
egs/egs_bases/tts/ds.yaml
ADDED
@@ -0,0 +1,33 @@
+base_config: ./fs2_orig.yaml
+
+# special configs for diffspeech
+task_cls: tasks.tts.diffspeech.DiffSpeechTask
+lr: 0.001
+timesteps: 100
+K_step: 71
+diff_loss_type: l1
+diff_decoder_type: 'wavenet'
+schedule_type: 'linear'
+max_beta: 0.06
+
+## model configs for diffspeech
+dilation_cycle_length: 1
+residual_layers: 20
+residual_channels: 256
+decay_steps: 50000
+keep_bins: 80
+#content_cond_steps: [ ] # [ 0, 10000 ]
+#spk_cond_steps: [ ] # [ 0, 10000 ]
+#gen_tgt_spk_id: -1
+
+
+
+# training configs for diffspeech
+#max_sentences: 48
+#num_sanity_val_steps: 1
+num_valid_plots: 10
+use_gt_dur: false
+use_gt_f0: false
+use_energy_embed: false
+#pitch_type: cwt
+max_updates: 160000
egs/egs_bases/tts/fs.yaml
ADDED
@@ -0,0 +1,75 @@
+base_config: ./base.yaml
+task_cls: tasks.tts.fs.FastSpeechTask
+
+# model
+hidden_size: 256
+dropout: 0.0
+encoder_type: rel_fft # rel_fft|fft|tacotron|tacotron2|conformer
+decoder_type: conv # fft|rnn|conv|conformer|wn
+
+# rnn enc/dec
+encoder_K: 8
+decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2
+
+# fft enc/dec
+enc_layers: 4
+enc_ffn_kernel_size: 9
+enc_prenet: true
+enc_pre_ln: true
+dec_layers: 4
+dec_ffn_kernel_size: 9
+num_heads: 2
+ffn_act: gelu
+ffn_hidden_size: 1024
+use_pos_embed: true
+
+# conv enc/dec
+enc_dec_norm: ln
+conv_use_pos: false
+layers_in_block: 2
+enc_dilations: [ 1, 1, 1, 1 ]
+enc_kernel_size: 5
+enc_post_net_kernel: 3
+dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder
+dec_kernel_size: 5
+dec_post_net_kernel: 3
+
+# duration
+predictor_hidden: -1
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+predictor_kernel: 5
+predictor_layers: 5
+predictor_dropout: 0.5
+
+# pitch and energy
+use_pitch_embed: false
+pitch_type: frame # frame|ph|cwt
+use_uv: true
+
+# reference encoder and speaker embedding
+lambda_commit: 0.25
+ref_norm_layer: bn
+dec_inp_add_noise: false
+
+# mel
+mel_losses: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
+
+# loss lambda
+lambda_f0: 1.0
+lambda_uv: 1.0
+lambda_energy: 0.1
+lambda_ph_dur: 0.1
+lambda_sent_dur: 1.0
+lambda_word_dur: 1.0
+predictor_grad: 0.1
+
+# train and eval
+warmup_updates: 4000
+max_tokens: 40000
+max_sentences: 128
+max_valid_sentences: 1
+max_updates: 160000
+use_gt_dur: false
+use_gt_f0: false
+ds_workers: 2
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config: ./fs.yaml
|
2 |
+
task_cls: tasks.tts.fs2_orig.FastSpeech2OrigTask
|
3 |
+
encoder_type: fft
|
4 |
+
decoder_type: fft
|
5 |
+
use_energy_embed: false
|
6 |
+
use_pitch_embed: true
|
7 |
+
pitch_type: cwt # frame|ph|cwt
|
8 |
+
binarization_args:
|
9 |
+
with_f0cwt: true
|
10 |
+
use_gt_energy: false
|
11 |
+
cwt_std_scale: 0.8
|
12 |
+
dropout: 0.1
|
13 |
+
mel_losses: l1
|
egs/egs_bases/tts/ps.yaml
ADDED
@@ -0,0 +1,63 @@
+base_config: ./fs.yaml
+
+###########################
+# models
+###########################
+# encoders
+hidden_size: 192
+ffn_hidden_size: 768
+enc_ffn_kernel_size: 5
+enc_layers: 4
+dur_level: word
+encoder_type: rel_fft
+use_word_encoder: true
+
+# mix ling encoder
+word_enc_layers: 4
+word_encoder_type: rel_fft
+use_pitch_embed: false
+enc_prenet: true
+enc_pre_ln: true
+text_encoder_postnet: true
+dropout: 0.0
+add_word_pos: true
+
+# dur predictor
+dur_predictor_layers: 3
+dur_predictor_kernel: 5
+predictor_dropout: 0.2
+
+## fvae
+use_fvae: true
+latent_size: 16
+fvae_encoder_type: conv
+fvae_decoder_type: conv
+fvae_enc_dec_hidden: 192
+fvae_kernel_size: 5
+fvae_enc_n_layers: 8
+fvae_dec_n_layers: 4
+fvae_strides: 4
+fvae_noise_scale: 1.0
+
+# prior flow
+use_prior_flow: true
+prior_flow_hidden: 64
+prior_flow_kernel_size: 3
+prior_flow_n_blocks: 4
+
+###########################
+# training and inference
+###########################
+lambda_kl: 1.0
+kl_min: 0.0
+lambda_sent_dur: 0.0
+kl_start_steps: 10000
+posterior_start_steps: 0
+frames_multiple: 4
+num_valid_plots: 10
+lr: 0.0002
+warmup_updates: 8000
+max_tokens: 40000
+valid_infer_interval: 10000
+max_sentences: 80
+max_updates: 480000
egs/egs_bases/tts/ps_flow.yaml
ADDED
@@ -0,0 +1,20 @@
+base_config: ./ps.yaml
+task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
+
+use_post_flow: true
+detach_postflow_input: true
+post_flow_lr: 0.001
+post_glow_hidden: 192
+post_glow_kernel_size: 3
+post_glow_n_blocks: 12
+post_glow_n_block_layers: 3
+post_share_cond_layers: false
+share_wn_layers: 4
+use_cond_proj: false
+use_latent_cond: false
+use_txt_cond: true
+sigmoid_scale: false
+post_glow_training_start: 160000
+noise_scale: 0.8
+infer_post_glow: true
+two_stage: true
egs/egs_bases/tts/ps_flow_small.yaml
ADDED
@@ -0,0 +1,42 @@
+base_config: ./ps_flow.yaml
+
+###########################
+# models
+###########################
+# encoders
+hidden_size: 128
+ffn_hidden_size: 512
+enc_ffn_kernel_size: 3
+enc_layers: 3
+word_enc_layers: 3
+
+# dur predictor
+dur_predictor_layers: 3
+dur_predictor_kernel: 5
+predictor_dropout: 0.2
+
+## fvae
+use_fvae: true
+latent_size: 16
+fvae_encoder_type: wn
+fvae_decoder_type: wn
+fvae_enc_dec_hidden: 128
+fvae_kernel_size: 3
+fvae_enc_n_layers: 8
+fvae_dec_n_layers: 3
+fvae_strides: 4
+fvae_noise_scale: 1.0
+
+
+# prior flow
+use_prior_flow: true
+prior_flow_hidden: 32
+prior_flow_kernel_size: 3
+prior_flow_n_blocks: 3
+# post flow
+post_glow_hidden: 128
+post_glow_kernel_size: 3
+post_glow_n_blocks: 8
+post_glow_n_block_layers: 3
+share_wn_layers: 4
+noise_scale: 0.6
egs/egs_bases/tts/synta.yaml
ADDED
@@ -0,0 +1,20 @@
+base_config: ./ps.yaml
+task_cls: tasks.tts.synta.SyntaSpeechTask
+
+use_post_flow: true
+detach_postflow_input: true
+post_flow_lr: 0.001
+post_glow_hidden: 192
+post_glow_kernel_size: 3
+post_glow_n_blocks: 12
+post_glow_n_block_layers: 3
+post_share_cond_layers: false
+share_wn_layers: 4
+use_cond_proj: false
+use_latent_cond: false
+use_txt_cond: true
+sigmoid_scale: false
+post_glow_training_start: 160000
+noise_scale: 0.8
+infer_post_glow: true
+two_stage: true
egs/egs_bases/tts/vocoder/base.yaml
ADDED
@@ -0,0 +1,20 @@
+base_config:
+  - egs/egs_bases/config_base.yaml
+  - ../dataset_params.yaml
+binarization_args:
+  with_wav: true
+  with_spk_embed: false
+  with_align: false
+
+generator_grad_norm: 10.0 # Generator's gradient norm.
+discriminator_grad_norm: 1.0 # Discriminator's gradient norm.
+
+###########
+# train and eval
+###########
+max_samples: 20480
+max_sentences: 8
+max_valid_sentences: 1
+max_updates: 2000000
+val_check_interval: 5000
+valid_infer_interval: 50000
egs/egs_bases/tts/vocoder/hifigan.yaml
ADDED
@@ -0,0 +1,28 @@
+base_config: ./base.yaml
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+resblock: "1"
+adam_b1: 0.8
+adam_b2: 0.99
+upsample_rates: [ 8,8,2,2 ]
+upsample_kernel_sizes: [ 16,16,4,4 ]
+upsample_initial_channel: 512
+resblock_kernel_sizes: [ 3,7,11 ]
+resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
+
+use_pitch_embed: false
+use_fm_loss: false
+use_ms_stft: false
+
+lambda_mel: 5.0
+lambda_mel_adv: 1.0
+lambda_cdisc: 4.0
+lambda_adv: 1.0
+
+lr: 0.0002 # Generator's learning rate.
+generator_scheduler_params:
+  step_size: 600
+  gamma: 0.999
+discriminator_scheduler_params:
+  step_size: 600
+  gamma: 0.999
+max_updates: 3000000
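Note: in a HiFi-GAN generator the product of upsample_rates must equal the acoustic hop size so that one mel frame expands to exactly one hop of waveform samples; here 8*8*2*2 = 256, matching hop_size: 256 in dataset_params.yaml. A quick sanity check:

    import math
    upsample_rates = [8, 8, 2, 2]
    hop_size = 256
    assert math.prod(upsample_rates) == hop_size  # 8*8*2*2 == 256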
inference/tts/__pycache__/base_tts_infer.cpython-36.pyc
ADDED
Binary file (4.19 kB)
inference/tts/base_tts_infer.py
ADDED
@@ -0,0 +1,120 @@
+import os
+
+import torch
+
+from modules.vocoder.hifigan.hifigan import HifiGanGenerator
+from tasks.tts.dataset_utils import FastSpeechWordDataset
+from tasks.tts.tts_utils import load_data_preprocessor
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import set_hparams
+
+
+class BaseTTSInfer:
+    def __init__(self, hparams, device=None):
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.hparams = hparams
+        self.device = device
+        self.data_dir = hparams['binary_data_dir']
+        self.preprocessor, self.preprocess_args = load_data_preprocessor()
+        self.ph_encoder, self.word_encoder = self.preprocessor.load_dict(self.data_dir)
+        self.spk_map = self.preprocessor.load_spk_map(self.data_dir)
+        self.ds_cls = FastSpeechWordDataset
+        self.model = self.build_model()
+        self.model.eval()
+        self.model.to(self.device)
+        self.vocoder = self.build_vocoder()
+        self.vocoder.eval()
+        self.vocoder.to(self.device)
+
+    def build_model(self):
+        raise NotImplementedError
+
+    def forward_model(self, inp):
+        raise NotImplementedError
+
+    def build_vocoder(self):
+        base_dir = self.hparams['vocoder_ckpt']
+        config_path = f'{base_dir}/config.yaml'
+        config = set_hparams(config_path, global_hparams=False)
+        vocoder = HifiGanGenerator(config)
+        load_ckpt(vocoder, base_dir, 'model_gen')
+        return vocoder
+
+    def run_vocoder(self, c):
+        c = c.transpose(2, 1)
+        y = self.vocoder(c)[:, 0]
+        return y
+
+    def preprocess_input(self, inp):
+        """
+
+        :param inp: {'text': str, 'item_name': (str, optional), 'spk_name': (str, optional)}
+        :return:
+        """
+        preprocessor, preprocess_args = self.preprocessor, self.preprocess_args
+        text_raw = inp['text']
+        item_name = inp.get('item_name', '<ITEM_NAME>')
+        spk_name = inp.get('spk_name', '<SINGLE_SPK>')
+        ph, txt, word, ph2word, ph_gb_word = preprocessor.txt_to_ph(
+            preprocessor.txt_processor, text_raw, preprocess_args)
+        word_token = self.word_encoder.encode(word)
+        ph_token = self.ph_encoder.encode(ph)
+        spk_id = self.spk_map[spk_name]
+        item = {'item_name': item_name, 'text': txt, 'ph': ph, 'spk_id': spk_id,
+                'ph_token': ph_token, 'word_token': word_token, 'ph2word': ph2word,
+                'ph_words': ph_gb_word, 'words': word}
+        item['ph_len'] = len(item['ph_token'])
+        return item
+
+    def input_to_batch(self, item):
+        item_names = [item['item_name']]
+        text = [item['text']]
+        ph = [item['ph']]
+        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
+        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+        word_tokens = torch.LongTensor(item['word_token'])[None, :].to(self.device)
+        word_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+        ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device)
+        spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)
+        batch = {
+            'item_name': item_names,
+            'text': text,
+            'ph': ph,
+            'txt_tokens': txt_tokens,
+            'txt_lengths': txt_lengths,
+            'word_tokens': word_tokens,
+            'word_lengths': word_lengths,
+            'ph2word': ph2word,
+            'spk_ids': spk_ids,
+        }
+        return batch
+
+    def postprocess_output(self, output):
+        return output
+
+    def infer_once(self, inp):
+        inp = self.preprocess_input(inp)
+        output = self.forward_model(inp)
+        output = self.postprocess_output(output)
+        return output
+
+    @classmethod
+    def example_run(cls):
+        from utils.commons.hparams import set_hparams
+        from utils.commons.hparams import hparams as hp
+        from utils.audio.io import save_wav
+
+        set_hparams()
+        if hp['ds_name'] in ['lj', 'libritts']:
+            inp = {
+                'text': 'the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.'
+            }
+        elif hp['ds_name'] in ['biaobei']:
+            inp = {
+                'text': '如果我想你三遍,天上乌云就散一片。'
+            }
+        infer_ins = cls(hp)
+        out = infer_ins.infer_once(inp)
+        os.makedirs('infer_out', exist_ok=True)
+        save_wav(out, f'infer_out/example_out.wav', hp['audio_sample_rate'])
inference/tts/ds.py
ADDED
@@ -0,0 +1,30 @@
+import torch
+# from inference.tts.fs import FastSpeechInfer
+# from modules.tts.fs2_orig import FastSpeech2Orig
+from inference.tts.base_tts_infer import BaseTTSInfer
+from modules.tts.diffspeech.shallow_diffusion_tts import GaussianDiffusion
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+
+class DiffSpeechInfer(BaseTTSInfer):
+    def build_model(self):
+        dict_size = len(self.ph_encoder)
+        model = GaussianDiffusion(dict_size, self.hparams)
+        model.eval()
+        load_ckpt(model, hparams['work_dir'], 'model')
+        return model
+
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        txt_tokens = sample['txt_tokens']  # [B, T_t]
+        spk_id = sample.get('spk_ids')
+        with torch.no_grad():
+            output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True)
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.cpu().numpy()
+        return wav_out[0]
+
+if __name__ == '__main__':
+    DiffSpeechInfer.example_run()
inference/tts/fs.py
ADDED
@@ -0,0 +1,29 @@
+import torch
+from inference.tts.base_tts_infer import BaseTTSInfer
+from modules.tts.fs import FastSpeech
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+
+class FastSpeechInfer(BaseTTSInfer):
+    def build_model(self):
+        dict_size = len(self.ph_encoder)
+        model = FastSpeech(dict_size, self.hparams)
+        model.eval()
+        load_ckpt(model, hparams['work_dir'], 'model')
+        return model
+
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        txt_tokens = sample['txt_tokens']  # [B, T_t]
+        spk_id = sample.get('spk_ids')
+        with torch.no_grad():
+            output = self.model(txt_tokens, spk_id=spk_id, infer=True)
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.cpu().numpy()
+        return wav_out[0]
+
+
+if __name__ == '__main__':
+    FastSpeechInfer.example_run()
inference/tts/fs2_orig.py
ADDED
@@ -0,0 +1,17 @@
+from inference.tts.fs import FastSpeechInfer
+from modules.tts.fs2_orig import FastSpeech2Orig
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+
+class FastSpeech2OrigInfer(FastSpeechInfer):
+    def build_model(self):
+        dict_size = len(self.ph_encoder)
+        model = FastSpeech2Orig(dict_size, self.hparams)
+        model.eval()
+        load_ckpt(model, hparams['work_dir'], 'model')
+        return model
+
+
+if __name__ == '__main__':
+    FastSpeech2OrigInfer.example_run()
inference/tts/gradio/gradio_settings.yaml
ADDED
@@ -0,0 +1,12 @@
+title: 'yerfor/SyntaSpeech'
+description: |
+  Gradio demo for yerfor/SyntaSpeech. To use it, simply add your audio, or click one of the examples to load them. Note: This space is running on CPU, inference times will be higher.
+article: |
+  Link to <a href='https://github.com/yerfor/SyntaSpeech' style='color:blue;' target='_blank\'>Github REPO</a>
+example_inputs:
+  - |-
+    the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
+  - |-
+    produced the block books, which were the immediate predecessors of the true printed book,
+inference_cls: inference.tts.synta.SyntaSpeechInfer
+exp_name: lj_synta
inference/tts/gradio/infer.py
ADDED
@@ -0,0 +1,69 @@
+import importlib
+import re
+
+import gradio as gr
+import yaml
+from gradio.inputs import Textbox
+
+from inference.tts.base_tts_infer import BaseTTSInfer
+from utils.commons.hparams import set_hparams
+from utils.commons.hparams import hparams as hp
+import numpy as np
+
+from utils.text.text_encoder import PUNCS
+
+
+class GradioInfer:
+    def __init__(self, exp_name, inference_cls, title, description, article, example_inputs):
+        self.exp_name = exp_name
+        self.title = title
+        self.description = description
+        self.article = article
+        self.example_inputs = example_inputs
+        pkg = ".".join(inference_cls.split(".")[:-1])
+        cls_name = inference_cls.split(".")[-1]
+        self.inference_cls = getattr(importlib.import_module(pkg), cls_name)
+
+    def greet(self, text):
+        sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
+        if sents[-1] not in list(PUNCS):
+            sents = sents + ['.']
+        audio_outs = []
+        s = ""
+        for i in range(0, len(sents), 2):
+            if len(sents[i]) > 0:
+                s += sents[i] + sents[i + 1]
+            if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
+                audio_out = self.infer_ins.infer_once({
+                    'text': s
+                })
+                audio_out = audio_out * 32767
+                audio_out = audio_out.astype(np.int16)
+                audio_outs.append(audio_out)
+                audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
+                s = ""
+        audio_outs = np.concatenate(audio_outs)
+        return hp['audio_sample_rate'], audio_outs
+
+    def run(self):
+        set_hparams(exp_name=self.exp_name)
+        infer_cls = self.inference_cls
+        self.infer_ins: BaseTTSInfer = infer_cls(hp)
+        example_inputs = self.example_inputs
+        iface = gr.Interface(fn=self.greet,
+                             inputs=Textbox(
+                                 lines=10, placeholder=None, default=example_inputs[0], label="input text"),
+                             outputs="audio",
+                             allow_flagging="never",
+                             title=self.title,
+                             description=self.description,
+                             article=self.article,
+                             examples=example_inputs,
+                             enable_queue=True)
+        iface.launch(share=True, cache_examples=True)
+
+
+if __name__ == '__main__':
+    gradio_config = yaml.safe_load(open('inference/tts/gradio/gradio_settings.yaml'))
+    g = GradioInfer(**gradio_config)
+    g.run()
inference/tts/ps_flow.py
ADDED
@@ -0,0 +1,39 @@
+import torch
+from inference.tts.base_tts_infer import BaseTTSInfer
+from modules.tts.portaspeech.portaspeech_flow import PortaSpeechFlow
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+
+class PortaSpeechFlowInfer(BaseTTSInfer):
+    def build_model(self):
+        ph_dict_size = len(self.ph_encoder)
+        word_dict_size = len(self.word_encoder)
+        model = PortaSpeechFlow(ph_dict_size, word_dict_size, self.hparams)
+        load_ckpt(model, hparams['work_dir'], 'model')
+        model.to(self.device)
+        with torch.no_grad():
+            model.store_inverse_all()
+        model.eval()
+        return model
+
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        with torch.no_grad():
+            output = self.model(
+                sample['txt_tokens'],
+                sample['word_tokens'],
+                ph2word=sample['ph2word'],
+                word_len=sample['word_lengths'].max(),
+                infer=True,
+                forward_post_glow=True,
+                spk_id=sample.get('spk_ids')
+            )
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.cpu().numpy()
+        return wav_out[0]
+
+
+if __name__ == '__main__':
+    PortaSpeechFlowInfer.example_run()
inference/tts/synta.py
ADDED
@@ -0,0 +1,76 @@
+import torch
+from inference.tts.base_tts_infer import BaseTTSInfer
+from modules.tts.syntaspeech.syntaspeech import SyntaSpeech
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+from modules.tts.syntaspeech.syntactic_graph_buider import Sentence2GraphParser
+
+class SyntaSpeechInfer(BaseTTSInfer):
+    def __init__(self, hparams, device=None):
+        super().__init__(hparams, device)
+        if hparams['ds_name'] in ['biaobei']:
+            self.syntactic_graph_builder = Sentence2GraphParser(language='zh')
+        elif hparams['ds_name'] in ['ljspeech', 'libritts']:
+            self.syntactic_graph_builder = Sentence2GraphParser(language='en')
+
+    def build_model(self):
+        ph_dict_size = len(self.ph_encoder)
+        word_dict_size = len(self.word_encoder)
+        model = SyntaSpeech(ph_dict_size, word_dict_size, self.hparams)
+        load_ckpt(model, hparams['work_dir'], 'model')
+        model.to(self.device)
+        with torch.no_grad():
+            model.store_inverse_all()
+        model.eval()
+        return model
+
+    def input_to_batch(self, item):
+        item_names = [item['item_name']]
+        text = [item['text']]
+        ph = [item['ph']]
+        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
+        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+        word_tokens = torch.LongTensor(item['word_token'])[None, :].to(self.device)
+        word_lengths = torch.LongTensor([word_tokens.shape[1]]).to(self.device)
+        ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device)
+        spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)
+        dgl_graph, etypes = self.syntactic_graph_builder.parse(item['text'], words=item['words'].split(" "), ph_words=item['ph_words'].split(" "))
+        dgl_graph = dgl_graph.to(self.device)
+        etypes = etypes.to(self.device)
+        batch = {
+            'item_name': item_names,
+            'text': text,
+            'ph': ph,
+            'txt_tokens': txt_tokens,
+            'txt_lengths': txt_lengths,
+            'word_tokens': word_tokens,
+            'word_lengths': word_lengths,
+            'ph2word': ph2word,
+            'spk_ids': spk_ids,
+            'graph_lst': [dgl_graph],
+            'etypes_lst': [etypes]
+        }
+        return batch
+
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        with torch.no_grad():
+            output = self.model(
+                sample['txt_tokens'],
+                sample['word_tokens'],
+                ph2word=sample['ph2word'],
+                word_len=sample['word_lengths'].max(),
+                infer=True,
+                forward_post_glow=True,
+                spk_id=sample.get('spk_ids'),
+                graph_lst=sample['graph_lst'],
+                etypes_lst=sample['etypes_lst']
+            )
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.cpu().numpy()
+        return wav_out[0]
+
+
+if __name__ == '__main__':
+    SyntaSpeechInfer.example_run()
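Note: each inference script above exposes an example_run() entry point, and GradioInfer.run() shows that set_hparams can be called with an exp_name. Assuming that pattern and the lj_synta checkpoint referenced in gradio_settings.yaml, a minimal standalone driver might look like the sketch below; it is an illustration, not a workflow confirmed by this diff.

    # hypothetical driver: assumes checkpoints/lj_synta holds the experiment's config and model checkpoint
    from utils.commons.hparams import set_hparams, hparams as hp
    from inference.tts.synta import SyntaSpeechInfer
    from utils.audio.io import save_wav

    set_hparams(exp_name='lj_synta')   # assumption: exp_name keyword, as used in GradioInfer.run()
    infer_ins = SyntaSpeechInfer(hp)
    wav = infer_ins.infer_once({'text': 'SyntaSpeech synthesizes speech from raw text.'})
    save_wav(wav, 'infer_out/demo.wav', hp['audio_sample_rate'])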
modules/commons/__pycache__/conv.cpython-36.pyc
ADDED
Binary file (6.54 kB)