Spaces · Build error
Commit: init
This view is limited to 50 files because it contains too many changes. See raw diff.
- README.md +4 -6
- data/binary/ljspeech/phone_set.json +1 -0
- data/binary/ljspeech/spk_map.json +1 -0
- data/binary/ljspeech/word_set.json +0 -0
- egs/datasets/audio/biaobei/__pycache__/preprocess.cpython-36.pyc +0 -0
- egs/datasets/audio/biaobei/base_text2mel.yaml +18 -0
- egs/datasets/audio/biaobei/preprocess.py +16 -0
- egs/datasets/audio/biaobei/ps_flow.yaml +3 -0
- egs/datasets/audio/biaobei/synta.yaml +19 -0
- egs/datasets/audio/libritts/__pycache__/preprocess.cpython-36.pyc +0 -0
- egs/datasets/audio/libritts/base_text2mel.yaml +22 -0
- egs/datasets/audio/libritts/preprocess.py +13 -0
- egs/datasets/audio/libritts/ps_flow.yaml +3 -0
- egs/datasets/audio/libritts/synta.yaml +19 -0
- egs/datasets/audio/lj/__pycache__/preprocess.cpython-36.pyc +0 -0
- egs/datasets/audio/lj/base_mel2wav.yaml +4 -0
- egs/datasets/audio/lj/base_text2mel.yaml +17 -0
- egs/datasets/audio/lj/ds.yaml +29 -0
- egs/datasets/audio/lj/fs.yaml +3 -0
- egs/datasets/audio/lj/fs2_orig.yaml +4 -0
- egs/datasets/audio/lj/hifigan.yaml +3 -0
- egs/datasets/audio/lj/preprocess.py +9 -0
- egs/datasets/audio/lj/ps_flow.yaml +3 -0
- egs/datasets/audio/lj/ps_flow_nips2021.yaml +11 -0
- egs/datasets/audio/lj/ps_flow_small.yaml +3 -0
- egs/datasets/audio/lj/ps_flow_small_nips2021.yaml +11 -0
- egs/datasets/audio/lj/synta.yaml +19 -0
- egs/egs_bases/config_base.yaml +41 -0
- egs/egs_bases/tts/base.yaml +56 -0
- egs/egs_bases/tts/base_zh.yaml +5 -0
- egs/egs_bases/tts/dataset_params.yaml +52 -0
- egs/egs_bases/tts/ds.yaml +33 -0
- egs/egs_bases/tts/fs.yaml +75 -0
- egs/egs_bases/tts/fs2_orig.yaml +13 -0
- egs/egs_bases/tts/ps.yaml +63 -0
- egs/egs_bases/tts/ps_flow.yaml +20 -0
- egs/egs_bases/tts/ps_flow_small.yaml +42 -0
- egs/egs_bases/tts/synta.yaml +20 -0
- egs/egs_bases/tts/vocoder/base.yaml +20 -0
- egs/egs_bases/tts/vocoder/hifigan.yaml +28 -0
- inference/tts/__pycache__/base_tts_infer.cpython-36.pyc +0 -0
- inference/tts/base_tts_infer.py +120 -0
- inference/tts/ds.py +30 -0
- inference/tts/fs.py +29 -0
- inference/tts/fs2_orig.py +17 -0
- inference/tts/gradio/gradio_settings.yaml +12 -0
- inference/tts/gradio/infer.py +69 -0
- inference/tts/ps_flow.py +39 -0
- inference/tts/synta.py +76 -0
- modules/commons/__pycache__/conv.cpython-36.pyc +0 -0
README.md
CHANGED
@@ -1,12 +1,10 @@
 ---
 title: SyntaSpeech
-emoji:
-colorFrom:
-colorTo:
+emoji: 🤗
+colorFrom: yellow
+colorTo: orange
 sdk: gradio
-
-app_file: app.py
+app_file: "inference/tts/gradio/infer.py"
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
data/binary/ljspeech/phone_set.json
ADDED
@@ -0,0 +1 @@
+["!", ",", ".", ":", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH"]
data/binary/ljspeech/spk_map.json
ADDED
@@ -0,0 +1 @@
+{"<SINGLE_SPK>": 0}
data/binary/ljspeech/word_set.json
ADDED
The diff for this file is too large to render. See raw diff.
egs/datasets/audio/biaobei/__pycache__/preprocess.cpython-36.pyc
ADDED
Binary file (1.12 kB)
egs/datasets/audio/biaobei/base_text2mel.yaml
ADDED
@@ -0,0 +1,18 @@
+base_config: egs/egs_bases/tts/base_zh.yaml
+raw_data_dir: 'data/raw/biaobei'
+processed_data_dir: 'data/processed/biaobei'
+binary_data_dir: 'data/binary/biaobei'
+preprocess_cls: egs.datasets.audio.biaobei.preprocess.BiaobeiPreprocess
+
+ds_name: biaobei
+binarization_args:
+  train_range: [ 871, -1 ]
+  test_range: [ 0, 523 ]
+  valid_range: [ 523, 871 ]
+test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+f0_min: 80
+f0_max: 600
+vocoder_ckpt: checkpoints/hifi_biaobei
egs/datasets/audio/biaobei/preprocess.py
ADDED
@@ -0,0 +1,16 @@
+from data_gen.tts.base_preprocess import BasePreprocessor
+import re
+
+
+class BiaobeiPreprocess(BasePreprocessor):
+    def meta_data(self):
+        input_dir = self.raw_data_dir
+        with open(f"{input_dir}/ProsodyLabeling/000001-010000.txt", encoding='utf-8') as f:
+            bb_lines = f.readlines()[::2]
+        for l_idx, l in (enumerate([re.sub("\#\d+", "", l.split('\t')[1].strip()) for l in bb_lines])):
+            item_name = f'{l_idx + 1:06d}'
+            wav_fn = f"{input_dir}/wav/{l_idx + 1:06d}.wav"
+            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': l}
+
+if __name__ == "__main__":
+    BiaobeiPreprocess().process()
egs/datasets/audio/biaobei/ps_flow.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow.yaml
+  - ./base_text2mel.yaml
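Note: most configs in this commit are layered through base_config chains like the one above. A minimal sketch of how such chains could be merged is shown below; the load_config helper and its merge order are illustrative assumptions, not the repository's actual set_hparams implementation, which may resolve paths and precedence differently.

    import os
    import yaml

    def load_config(path, root='.'):
        """Recursively merge a YAML config with the files listed under base_config.

        Later base files, and finally the child config itself, override earlier keys.
        """
        with open(os.path.join(root, path)) as f:
            cfg = yaml.safe_load(f) or {}
        bases = cfg.pop('base_config', [])
        if isinstance(bases, str):
            bases = [bases]
        merged = {}
        for base in bases:
            # relative entries such as './base_text2mel.yaml' are resolved next to the child config
            base_path = os.path.join(os.path.dirname(path), base) if base.startswith('.') else base
            merged.update(load_config(base_path, root))
        merged.update(cfg)
        return merged

    # e.g. load_config('egs/datasets/audio/biaobei/ps_flow.yaml')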
egs/datasets/audio/biaobei/synta.yaml
ADDED
@@ -0,0 +1,19 @@
+base_config:
+  - egs/egs_bases/tts/synta.yaml
+  - ./base_text2mel.yaml
+
+lambda_mel_adv: 0.05
+
+disc_win_num: 3
+mel_disc_hidden_size: 128
+disc_norm: in
+disc_reduction: stack
+disc_interval: 1
+disc_lr: 0.0001
+disc_start_steps: 0
+discriminator_scheduler_params:
+  gamma: 0.5
+  step_size: 40000
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  weight_decay: 0.0
egs/datasets/audio/libritts/__pycache__/preprocess.cpython-36.pyc
ADDED
Binary file (893 Bytes)
egs/datasets/audio/libritts/base_text2mel.yaml
ADDED
@@ -0,0 +1,22 @@
+ds_name: libritts
+base_config: egs/egs_bases/tts/base.yaml
+raw_data_dir: 'data/raw/LibriTTS'
+processed_data_dir: 'data/processed/libritts'
+binary_data_dir: 'data/binary/libritts'
+preprocess_cls: egs.datasets.audio.libritts.preprocess.LibriTTSPreprocess
+binarization_args:
+  train_range: [ 871, -1 ]
+  test_range: [ 0, 523 ]
+  valid_range: [ 523, 871 ]
+  shuffle: false
+  with_spk_id: true
+  with_spk_embed: false
+test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+f0_min: 80
+f0_max: 600
+vocoder: PWG
+vocoder_ckpt: checkpoints/pwg_libritts
+num_spk: 2000
egs/datasets/audio/libritts/preprocess.py
ADDED
@@ -0,0 +1,13 @@
+from data_gen.tts.base_preprocess import BasePreprocessor
+import glob, os
+
+class LibriTTSPreprocess(BasePreprocessor):
+    def meta_data(self):
+        wav_fns = sorted(glob.glob(f'{self.raw_data_dir}/*/*/*/*.wav'))
+        for wav_fn in wav_fns:
+            item_name = os.path.basename(wav_fn)[:-4]
+            txt_fn = f'{wav_fn[:-4]}.normalized.txt'
+            with open(txt_fn, 'r') as f:
+                txt = f.read()
+            spk_name = item_name.split("_")[0]
+            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt, 'spk_name': spk_name}
egs/datasets/audio/libritts/ps_flow.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow.yaml
+  - ./base_text2mel.yaml
egs/datasets/audio/libritts/synta.yaml
ADDED
@@ -0,0 +1,19 @@
+base_config:
+  - egs/egs_bases/tts/synta.yaml
+  - ./base_text2mel.yaml
+
+lambda_mel_adv: 0.05
+
+disc_win_num: 3
+mel_disc_hidden_size: 128
+disc_norm: in
+disc_reduction: stack
+disc_interval: 1
+disc_lr: 0.0001
+disc_start_steps: 0
+discriminator_scheduler_params:
+  gamma: 0.5
+  step_size: 40000
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  weight_decay: 0.0
egs/datasets/audio/lj/__pycache__/preprocess.cpython-36.pyc
ADDED
Binary file (711 Bytes)
egs/datasets/audio/lj/base_mel2wav.yaml
ADDED
@@ -0,0 +1,4 @@
+base_config: egs/egs_bases/tts/vocoder/base.yaml
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech_wav'
egs/datasets/audio/lj/base_text2mel.yaml
ADDED
@@ -0,0 +1,17 @@
+ds_name: ljspeech
+base_config: egs/egs_bases/tts/base.yaml
+raw_data_dir: 'data/raw/LJSpeech-1.1'
+processed_data_dir: 'data/processed/ljspeech'
+binary_data_dir: 'data/binary/ljspeech'
+preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
+binarization_args:
+  train_range: [ 871, -1 ]
+  test_range: [ 0, 523 ]
+  valid_range: [ 523, 871 ]
+test_ids: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+            68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
+            316, 324, 402, 422, 485, 500, 505, 508, 509, 519 ]
+f0_min: 80
+f0_max: 600
+vocoder_ckpt: checkpoints/hifi_lj
egs/datasets/audio/lj/ds.yaml
ADDED
@@ -0,0 +1,29 @@
+base_config:
+  - egs/egs_bases/tts/ds.yaml
+  - ./fs2_orig.yaml
+
+fs2_ckpt: checkpoints/aux_exp/model_ckpt_steps_100000.ckpt
+
+# spec_min and spec_max are calculated on the training set.
+spec_min: [ -4.7574, -4.6783, -4.6431, -4.5832, -4.5390, -4.6771, -4.8089, -4.7672,
+            -4.5784, -4.7755, -4.7150, -4.8919, -4.8271, -4.7389, -4.6047, -4.7759,
+            -4.6799, -4.8201, -4.7823, -4.8262, -4.7857, -4.7545, -4.9358, -4.9733,
+            -5.1134, -5.1395, -4.9016, -4.8434, -5.0189, -4.8460, -5.0529, -4.9510,
+            -5.0217, -5.0049, -5.1831, -5.1445, -5.1015, -5.0281, -4.9887, -4.9916,
+            -4.9785, -4.9071, -4.9488, -5.0342, -4.9332, -5.0650, -4.8924, -5.0875,
+            -5.0483, -5.0848, -5.0655, -5.0279, -5.0015, -5.0792, -5.0636, -5.2413,
+            -5.1421, -5.1710, -5.3256, -5.0511, -5.1186, -5.0057, -5.0446, -5.1173,
+            -5.0325, -5.1085, -5.0053, -5.0755, -5.1176, -5.1004, -5.2153, -5.2757,
+            -5.3025, -5.2867, -5.2918, -5.3328, -5.2731, -5.2985, -5.2400, -5.2211 ]
+spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5830, 0.7093,
+            0.6461, 0.6101, 0.7316, 0.7715, 0.7681, 0.8349, 0.7815, 0.7591,
+            0.7910, 0.7433, 0.7352, 0.6869, 0.6854, 0.6623, 0.5353, 0.6492,
+            0.6909, 0.6106, 0.5761, 0.5236, 0.5638, 0.4054, 0.4545, 0.3407,
+            0.3037, 0.3380, 0.1599, 0.1603, 0.2741, 0.2130, 0.1569, 0.1911,
+            0.2324, 0.1586, 0.1221, 0.0341, -0.0558, 0.0553, -0.1153, -0.0933,
+            -0.1171, -0.0050, -0.1519, -0.1629, -0.0522, -0.0739, -0.2069, -0.2405,
+            -0.1244, -0.2582, -0.1361, -0.1575, -0.1442, 0.0513, -0.1567, -0.2000,
+            0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.1389, 0.1382,
+            0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2280, 0.2932, 0.3047 ]
+
+max_tokens: 30000
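Note: spec_min/spec_max above are per-mel-bin statistics of the training set. DiffSpeech-style diffusion decoders typically use such statistics to squash the log-mel spectrogram into [-1, 1] before diffusion and to undo it afterwards; the sketch below assumes that convention, and norm_spec/denorm_spec are illustrative names rather than the repository's own functions.

    import numpy as np

    def norm_spec(mel, spec_min, spec_max):
        # mel: [T, 80] log-mel; map each bin linearly into [-1, 1]
        spec_min, spec_max = np.asarray(spec_min), np.asarray(spec_max)
        return (mel - spec_min) / (spec_max - spec_min) * 2 - 1

    def denorm_spec(mel_norm, spec_min, spec_max):
        spec_min, spec_max = np.asarray(spec_min), np.asarray(spec_max)
        return (mel_norm + 1) / 2 * (spec_max - spec_min) + spec_min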
egs/datasets/audio/lj/fs.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/fs.yaml
+  - ./base_text2mel.yaml
egs/datasets/audio/lj/fs2_orig.yaml
ADDED
@@ -0,0 +1,4 @@
+base_config:
+  - egs/egs_bases/tts/fs2_orig.yaml
+  - ./base_text2mel.yaml
+binary_data_dir: 'data/binary/ljspeech_cwt'
egs/datasets/audio/lj/hifigan.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/vocoder/hifigan.yaml
+  - ./base_mel2wav.yaml
egs/datasets/audio/lj/preprocess.py
ADDED
@@ -0,0 +1,9 @@
+from data_gen.tts.base_preprocess import BasePreprocessor
+
+
+class LJPreprocess(BasePreprocessor):
+    def meta_data(self):
+        for l in open(f'{self.raw_data_dir}/metadata.csv').readlines():
+            item_name, _, txt = l.strip().split("|")
+            wav_fn = f"{self.raw_data_dir}/wavs/{item_name}.wav"
+            yield {'item_name': item_name, 'wav_fn': wav_fn, 'txt': txt}
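Note: LJPreprocess assumes the standard LJSpeech metadata.csv layout of three pipe-separated fields (item id | raw text | normalized text) and keeps only the last field. A hypothetical line (the text here is made up, not an actual LJSpeech transcript) and how it is parsed:

    # hypothetical metadata.csv line: "<id>|<raw text>|<normalized text>"
    line = "LJ001-0001|Printing, in the only sense.|Printing, in the only sense."
    item_name, _, txt = line.strip().split("|")
    # yields {'item_name': 'LJ001-0001',
    #         'wav_fn': 'data/raw/LJSpeech-1.1/wavs/LJ001-0001.wav',
    #         'txt': 'Printing, in the only sense.'}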
egs/datasets/audio/lj/ps_flow.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow.yaml
+  - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_nips2021.yaml
ADDED
@@ -0,0 +1,11 @@
+base_config:
+  - ./ps_flow.yaml
+max_sentences: 64
+dur_level: word
+use_word_encoder: false
+enc_prenet: true
+enc_pre_ln: false
+fvae_encoder_type: wn
+fvae_decoder_type: wn
+text_encoder_postnet: false
+warmup_updates: 8000
egs/datasets/audio/lj/ps_flow_small.yaml
ADDED
@@ -0,0 +1,3 @@
+base_config:
+  - egs/egs_bases/tts/ps_flow_small.yaml
+  - ./base_text2mel.yaml
egs/datasets/audio/lj/ps_flow_small_nips2021.yaml
ADDED
@@ -0,0 +1,11 @@
+base_config:
+  - ./ps_flow_small.yaml
+max_sentences: 128
+dur_level: word
+use_word_encoder: false
+enc_prenet: true
+enc_pre_ln: false
+fvae_encoder_type: wn
+fvae_decoder_type: wn
+text_encoder_postnet: false
+warmup_updates: 8000
egs/datasets/audio/lj/synta.yaml
ADDED
@@ -0,0 +1,19 @@
+base_config:
+  - egs/egs_bases/tts/synta.yaml
+  - ./base_text2mel.yaml
+
+lambda_mel_adv: 0.05
+
+disc_win_num: 3
+mel_disc_hidden_size: 128
+disc_norm: in
+disc_reduction: stack
+disc_interval: 1
+disc_lr: 0.0001
+disc_start_steps: 0
+discriminator_scheduler_params:
+  gamma: 0.5
+  step_size: 40000
+discriminator_optimizer_params:
+  eps: 1.0e-06
+  weight_decay: 0.0
egs/egs_bases/config_base.yaml
ADDED
@@ -0,0 +1,41 @@
+# task
+binary_data_dir: ''
+work_dir: '' # experiment directory.
+infer: false # infer
+amp: false
+seed: 1234
+debug: false
+save_codes: ['tasks', 'modules', 'egs']
+
+#############
+# dataset
+#############
+ds_workers: 1
+test_num: 100
+endless_ds: true
+sort_by_len: true
+
+#########
+# train and eval
+#########
+print_nan_grads: false
+load_ckpt: ''
+save_best: false
+num_ckpt_keep: 3
+clip_grad_norm: 0
+accumulate_grad_batches: 1
+tb_log_interval: 100
+num_sanity_val_steps: 5 # steps of validation at the beginning
+check_val_every_n_epoch: 10
+val_check_interval: 2000
+valid_monitor_key: 'val_loss'
+valid_monitor_mode: 'min'
+max_epochs: 1000
+max_updates: 1000000
+max_tokens: 40000
+max_sentences: 100000
+max_valid_tokens: -1
+max_valid_sentences: -1
+eval_max_batches: -1
+resume_from_checkpoint: 0
+rename_tmux: true
egs/egs_bases/tts/base.yaml
ADDED
@@ -0,0 +1,56 @@
+# task
+base_config:
+  - ../config_base.yaml
+  - ./dataset_params.yaml
+
+#############
+# dataset in training
+#############
+endless_ds: true
+min_frames: 0
+max_frames: 1548
+frames_multiple: 1
+max_input_tokens: 1550
+ds_workers: 1
+
+#########
+# model
+#########
+use_spk_id: false
+use_spk_embed: false
+mel_losses: "ssim:0.5|l1:0.5"
+
+###########
+# optimization
+###########
+lr: 0.0005
+scheduler: warmup # rsqrt|warmup|none
+warmup_updates: 4000
+optimizer_adam_beta1: 0.9
+optimizer_adam_beta2: 0.98
+weight_decay: 0
+clip_grad_norm: 1
+clip_grad_value: 0
+
+
+###########
+# train and eval
+###########
+use_word_input: false
+max_valid_sentences: 1
+max_valid_tokens: 60000
+valid_infer_interval: 10000
+train_set_name: 'train'
+train_sets: ''
+valid_set_name: 'valid'
+test_set_name: 'test'
+num_valid_plots: 10
+test_ids: [ ]
+test_input_yaml: ''
+vocoder: HifiGAN
+vocoder_ckpt: ''
+profile_infer: false
+out_wav_norm: false
+save_gt: true
+save_f0: false
+gen_dir_name: ''
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config: ./base.yaml
|
2 |
+
preprocess_args:
|
3 |
+
txt_processor: zh
|
4 |
+
|
5 |
+
word_size: 3000
|
egs/egs_bases/tts/dataset_params.yaml
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio_num_mel_bins: 80
|
2 |
+
audio_sample_rate: 22050
|
3 |
+
hop_size: 256 # For 22050Hz, 275 ~= 12.5 ms (0.0125 * sample_rate)
|
4 |
+
win_size: 1024 # For 22050Hz, 1100 ~= 50 ms (If None, win_size: fft_size) (0.05 * sample_rate)
|
5 |
+
fft_size: 1024 # Extra window size is filled with 0 paddings to match this parameter
|
6 |
+
fmin: 80 # Set this to 55 if your speaker is male! if female, 95 should help taking off noise. (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
|
7 |
+
fmax: 7600 # To be increased/reduced depending on data.
|
8 |
+
f0_min: 80
|
9 |
+
f0_max: 800
|
10 |
+
griffin_lim_iters: 30
|
11 |
+
pitch_extractor: parselmouth
|
12 |
+
num_spk: 1
|
13 |
+
mel_vmin: -6
|
14 |
+
mel_vmax: 1.5
|
15 |
+
loud_norm: false
|
16 |
+
|
17 |
+
raw_data_dir: ''
|
18 |
+
processed_data_dir: ''
|
19 |
+
binary_data_dir: ''
|
20 |
+
preprocess_cls: ''
|
21 |
+
binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
|
22 |
+
preprocess_args:
|
23 |
+
nsample_per_mfa_group: 1000
|
24 |
+
# text process
|
25 |
+
txt_processor: en
|
26 |
+
use_mfa: true
|
27 |
+
with_phsep: true
|
28 |
+
reset_phone_dict: true
|
29 |
+
reset_word_dict: true
|
30 |
+
add_eos_bos: true
|
31 |
+
# mfa
|
32 |
+
mfa_group_shuffle: false
|
33 |
+
mfa_offset: 0.02
|
34 |
+
# wav processors
|
35 |
+
wav_processors: [ ]
|
36 |
+
save_sil_mask: true
|
37 |
+
vad_max_silence_length: 12
|
38 |
+
binarization_args:
|
39 |
+
shuffle: false
|
40 |
+
with_wav: false
|
41 |
+
with_align: true
|
42 |
+
with_spk_embed: false
|
43 |
+
with_f0: true
|
44 |
+
with_f0cwt: false
|
45 |
+
with_linear: false
|
46 |
+
trim_eos_bos: false
|
47 |
+
min_sil_duration: 0.1
|
48 |
+
train_range: [ 200, -1 ]
|
49 |
+
test_range: [ 0, 100 ]
|
50 |
+
valid_range: [ 100, 200 ]
|
51 |
+
word_dict_size: 10000
|
52 |
+
pitch_key: pitch
|
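Note: the hop_size/win_size comments above still quote the 275/1100-sample figures from a different setup; with the values actually set here (22050 Hz, hop 256, win 1024) the frame shift is roughly 11.6 ms rather than 12.5 ms. A quick check of the frame math:

    sr, hop_size, win_size = 22050, 256, 1024
    print(hop_size / sr * 1000)   # ~11.6 ms frame shift
    print(win_size / sr * 1000)   # ~46.4 ms analysis window
    print(sr / hop_size)          # ~86 mel frames per second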
egs/egs_bases/tts/ds.yaml
ADDED
@@ -0,0 +1,33 @@
+base_config: ./fs2_orig.yaml
+
+# special configs for diffspeech
+task_cls: tasks.tts.diffspeech.DiffSpeechTask
+lr: 0.001
+timesteps: 100
+K_step: 71
+diff_loss_type: l1
+diff_decoder_type: 'wavenet'
+schedule_type: 'linear'
+max_beta: 0.06
+
+## model configs for diffspeech
+dilation_cycle_length: 1
+residual_layers: 20
+residual_channels: 256
+decay_steps: 50000
+keep_bins: 80
+#content_cond_steps: [ ] # [ 0, 10000 ]
+#spk_cond_steps: [ ] # [ 0, 10000 ]
+#gen_tgt_spk_id: -1
+
+
+
+# training configs for diffspeech
+#max_sentences: 48
+#num_sanity_val_steps: 1
+num_valid_plots: 10
+use_gt_dur: false
+use_gt_f0: false
+use_energy_embed: false
+#pitch_type: cwt
+max_updates: 160000
egs/egs_bases/tts/fs.yaml
ADDED
@@ -0,0 +1,75 @@
+base_config: ./base.yaml
+task_cls: tasks.tts.fs.FastSpeechTask
+
+# model
+hidden_size: 256
+dropout: 0.0
+encoder_type: rel_fft # rel_fft|fft|tacotron|tacotron2|conformer
+decoder_type: conv # fft|rnn|conv|conformer|wn
+
+# rnn enc/dec
+encoder_K: 8
+decoder_rnn_dim: 0 # for rnn decoder, 0 -> hidden_size * 2
+
+# fft enc/dec
+enc_layers: 4
+enc_ffn_kernel_size: 9
+enc_prenet: true
+enc_pre_ln: true
+dec_layers: 4
+dec_ffn_kernel_size: 9
+num_heads: 2
+ffn_act: gelu
+ffn_hidden_size: 1024
+use_pos_embed: true
+
+# conv enc/dec
+enc_dec_norm: ln
+conv_use_pos: false
+layers_in_block: 2
+enc_dilations: [ 1, 1, 1, 1 ]
+enc_kernel_size: 5
+enc_post_net_kernel: 3
+dec_dilations: [ 1, 1, 1, 1 ] # for conv decoder
+dec_kernel_size: 5
+dec_post_net_kernel: 3
+
+# duration
+predictor_hidden: -1
+dur_predictor_kernel: 3
+dur_predictor_layers: 2
+predictor_kernel: 5
+predictor_layers: 5
+predictor_dropout: 0.5
+
+# pitch and energy
+use_pitch_embed: false
+pitch_type: frame # frame|ph|cwt
+use_uv: true
+
+# reference encoder and speaker embedding
+lambda_commit: 0.25
+ref_norm_layer: bn
+dec_inp_add_noise: false
+
+# mel
+mel_losses: l1:0.5|ssim:0.5 # l1|l2|gdl|ssim or l1:0.5|ssim:0.5
+
+# loss lambda
+lambda_f0: 1.0
+lambda_uv: 1.0
+lambda_energy: 0.1
+lambda_ph_dur: 0.1
+lambda_sent_dur: 1.0
+lambda_word_dur: 1.0
+predictor_grad: 0.1
+
+# train and eval
+warmup_updates: 4000
+max_tokens: 40000
+max_sentences: 128
+max_valid_sentences: 1
+max_updates: 160000
+use_gt_dur: false
+use_gt_f0: false
+ds_workers: 2
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_config: ./fs.yaml
|
2 |
+
task_cls: tasks.tts.fs2_orig.FastSpeech2OrigTask
|
3 |
+
encoder_type: fft
|
4 |
+
decoder_type: fft
|
5 |
+
use_energy_embed: false
|
6 |
+
use_pitch_embed: true
|
7 |
+
pitch_type: cwt # frame|ph|cwt
|
8 |
+
binarization_args:
|
9 |
+
with_f0cwt: true
|
10 |
+
use_gt_energy: false
|
11 |
+
cwt_std_scale: 0.8
|
12 |
+
dropout: 0.1
|
13 |
+
mel_losses: l1
|
egs/egs_bases/tts/ps.yaml
ADDED
@@ -0,0 +1,63 @@
+base_config: ./fs.yaml
+
+###########################
+# models
+###########################
+# encoders
+hidden_size: 192
+ffn_hidden_size: 768
+enc_ffn_kernel_size: 5
+enc_layers: 4
+dur_level: word
+encoder_type: rel_fft
+use_word_encoder: true
+
+# mix ling encoder
+word_enc_layers: 4
+word_encoder_type: rel_fft
+use_pitch_embed: false
+enc_prenet: true
+enc_pre_ln: true
+text_encoder_postnet: true
+dropout: 0.0
+add_word_pos: true
+
+# dur predictor
+dur_predictor_layers: 3
+dur_predictor_kernel: 5
+predictor_dropout: 0.2
+
+## fvae
+use_fvae: true
+latent_size: 16
+fvae_encoder_type: conv
+fvae_decoder_type: conv
+fvae_enc_dec_hidden: 192
+fvae_kernel_size: 5
+fvae_enc_n_layers: 8
+fvae_dec_n_layers: 4
+fvae_strides: 4
+fvae_noise_scale: 1.0
+
+# prior flow
+use_prior_flow: true
+prior_flow_hidden: 64
+prior_flow_kernel_size: 3
+prior_flow_n_blocks: 4
+
+###########################
+# training and inference
+###########################
+lambda_kl: 1.0
+kl_min: 0.0
+lambda_sent_dur: 0.0
+kl_start_steps: 10000
+posterior_start_steps: 0
+frames_multiple: 4
+num_valid_plots: 10
+lr: 0.0002
+warmup_updates: 8000
+max_tokens: 40000
+valid_infer_interval: 10000
+max_sentences: 80
+max_updates: 480000
egs/egs_bases/tts/ps_flow.yaml
ADDED
@@ -0,0 +1,20 @@
+base_config: ./ps.yaml
+task_cls: tasks.tts.ps_flow.PortaSpeechFlowTask
+
+use_post_flow: true
+detach_postflow_input: true
+post_flow_lr: 0.001
+post_glow_hidden: 192
+post_glow_kernel_size: 3
+post_glow_n_blocks: 12
+post_glow_n_block_layers: 3
+post_share_cond_layers: false
+share_wn_layers: 4
+use_cond_proj: false
+use_latent_cond: false
+use_txt_cond: true
+sigmoid_scale: false
+post_glow_training_start: 160000
+noise_scale: 0.8
+infer_post_glow: true
+two_stage: true
egs/egs_bases/tts/ps_flow_small.yaml
ADDED
@@ -0,0 +1,42 @@
+base_config: ./ps_flow.yaml
+
+###########################
+# models
+###########################
+# encoders
+hidden_size: 128
+ffn_hidden_size: 512
+enc_ffn_kernel_size: 3
+enc_layers: 3
+word_enc_layers: 3
+
+# dur predictor
+dur_predictor_layers: 3
+dur_predictor_kernel: 5
+predictor_dropout: 0.2
+
+## fvae
+use_fvae: true
+latent_size: 16
+fvae_encoder_type: wn
+fvae_decoder_type: wn
+fvae_enc_dec_hidden: 128
+fvae_kernel_size: 3
+fvae_enc_n_layers: 8
+fvae_dec_n_layers: 3
+fvae_strides: 4
+fvae_noise_scale: 1.0
+
+
+# prior flow
+use_prior_flow: true
+prior_flow_hidden: 32
+prior_flow_kernel_size: 3
+prior_flow_n_blocks: 3
+# post flow
+post_glow_hidden: 128
+post_glow_kernel_size: 3
+post_glow_n_blocks: 8
+post_glow_n_block_layers: 3
+share_wn_layers: 4
+noise_scale: 0.6
egs/egs_bases/tts/synta.yaml
ADDED
@@ -0,0 +1,20 @@
+base_config: ./ps.yaml
+task_cls: tasks.tts.synta.SyntaSpeechTask
+
+use_post_flow: true
+detach_postflow_input: true
+post_flow_lr: 0.001
+post_glow_hidden: 192
+post_glow_kernel_size: 3
+post_glow_n_blocks: 12
+post_glow_n_block_layers: 3
+post_share_cond_layers: false
+share_wn_layers: 4
+use_cond_proj: false
+use_latent_cond: false
+use_txt_cond: true
+sigmoid_scale: false
+post_glow_training_start: 160000
+noise_scale: 0.8
+infer_post_glow: true
+two_stage: true
egs/egs_bases/tts/vocoder/base.yaml
ADDED
@@ -0,0 +1,20 @@
+base_config:
+  - egs/egs_bases/config_base.yaml
+  - ../dataset_params.yaml
+binarization_args:
+  with_wav: true
+  with_spk_embed: false
+  with_align: false
+
+generator_grad_norm: 10.0 # Generator's gradient norm.
+discriminator_grad_norm: 1.0 # Discriminator's gradient norm.
+
+###########
+# train and eval
+###########
+max_samples: 20480
+max_sentences: 8
+max_valid_sentences: 1
+max_updates: 2000000
+val_check_interval: 5000
+valid_infer_interval: 50000
egs/egs_bases/tts/vocoder/hifigan.yaml
ADDED
@@ -0,0 +1,28 @@
+base_config: ./base.yaml
+task_cls: tasks.vocoder.hifigan.HifiGanTask
+resblock: "1"
+adam_b1: 0.8
+adam_b2: 0.99
+upsample_rates: [ 8,8,2,2 ]
+upsample_kernel_sizes: [ 16,16,4,4 ]
+upsample_initial_channel: 512
+resblock_kernel_sizes: [ 3,7,11 ]
+resblock_dilation_sizes: [ [ 1,3,5 ], [ 1,3,5 ], [ 1,3,5 ] ]
+
+use_pitch_embed: false
+use_fm_loss: false
+use_ms_stft: false
+
+lambda_mel: 5.0
+lambda_mel_adv: 1.0
+lambda_cdisc: 4.0
+lambda_adv: 1.0
+
+lr: 0.0002 # Generator's learning rate.
+generator_scheduler_params:
+  step_size: 600
+  gamma: 0.999
+discriminator_scheduler_params:
+  step_size: 600
+  gamma: 0.999
+max_updates: 3000000
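Note: in a HiFi-GAN generator the product of upsample_rates must equal the acoustic hop size so that one mel frame expands to exactly one hop of waveform samples; here 8*8*2*2 = 256, matching hop_size: 256 in dataset_params.yaml. A quick sanity check:

    import math
    upsample_rates = [8, 8, 2, 2]
    hop_size = 256
    assert math.prod(upsample_rates) == hop_size  # 8*8*2*2 == 256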
inference/tts/__pycache__/base_tts_infer.cpython-36.pyc
ADDED
Binary file (4.19 kB)
inference/tts/base_tts_infer.py
ADDED
@@ -0,0 +1,120 @@
+import os
+
+import torch
+
+from modules.vocoder.hifigan.hifigan import HifiGanGenerator
+from tasks.tts.dataset_utils import FastSpeechWordDataset
+from tasks.tts.tts_utils import load_data_preprocessor
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import set_hparams
+
+
+class BaseTTSInfer:
+    def __init__(self, hparams, device=None):
+        if device is None:
+            device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.hparams = hparams
+        self.device = device
+        self.data_dir = hparams['binary_data_dir']
+        self.preprocessor, self.preprocess_args = load_data_preprocessor()
+        self.ph_encoder, self.word_encoder = self.preprocessor.load_dict(self.data_dir)
+        self.spk_map = self.preprocessor.load_spk_map(self.data_dir)
+        self.ds_cls = FastSpeechWordDataset
+        self.model = self.build_model()
+        self.model.eval()
+        self.model.to(self.device)
+        self.vocoder = self.build_vocoder()
+        self.vocoder.eval()
+        self.vocoder.to(self.device)
+
+    def build_model(self):
+        raise NotImplementedError
+
+    def forward_model(self, inp):
+        raise NotImplementedError
+
+    def build_vocoder(self):
+        base_dir = self.hparams['vocoder_ckpt']
+        config_path = f'{base_dir}/config.yaml'
+        config = set_hparams(config_path, global_hparams=False)
+        vocoder = HifiGanGenerator(config)
+        load_ckpt(vocoder, base_dir, 'model_gen')
+        return vocoder
+
+    def run_vocoder(self, c):
+        c = c.transpose(2, 1)
+        y = self.vocoder(c)[:, 0]
+        return y
+
+    def preprocess_input(self, inp):
+        """
+
+        :param inp: {'text': str, 'item_name': (str, optional), 'spk_name': (str, optional)}
+        :return:
+        """
+        preprocessor, preprocess_args = self.preprocessor, self.preprocess_args
+        text_raw = inp['text']
+        item_name = inp.get('item_name', '<ITEM_NAME>')
+        spk_name = inp.get('spk_name', '<SINGLE_SPK>')
+        ph, txt, word, ph2word, ph_gb_word = preprocessor.txt_to_ph(
+            preprocessor.txt_processor, text_raw, preprocess_args)
+        word_token = self.word_encoder.encode(word)
+        ph_token = self.ph_encoder.encode(ph)
+        spk_id = self.spk_map[spk_name]
+        item = {'item_name': item_name, 'text': txt, 'ph': ph, 'spk_id': spk_id,
+                'ph_token': ph_token, 'word_token': word_token, 'ph2word': ph2word,
+                'ph_words': ph_gb_word, 'words': word}
+        item['ph_len'] = len(item['ph_token'])
+        return item
+
+    def input_to_batch(self, item):
+        item_names = [item['item_name']]
+        text = [item['text']]
+        ph = [item['ph']]
+        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
+        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+        word_tokens = torch.LongTensor(item['word_token'])[None, :].to(self.device)
+        word_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+        ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device)
+        spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)
+        batch = {
+            'item_name': item_names,
+            'text': text,
+            'ph': ph,
+            'txt_tokens': txt_tokens,
+            'txt_lengths': txt_lengths,
+            'word_tokens': word_tokens,
+            'word_lengths': word_lengths,
+            'ph2word': ph2word,
+            'spk_ids': spk_ids,
+        }
+        return batch
+
+    def postprocess_output(self, output):
+        return output
+
+    def infer_once(self, inp):
+        inp = self.preprocess_input(inp)
+        output = self.forward_model(inp)
+        output = self.postprocess_output(output)
+        return output
+
+    @classmethod
+    def example_run(cls):
+        from utils.commons.hparams import set_hparams
+        from utils.commons.hparams import hparams as hp
+        from utils.audio.io import save_wav
+
+        set_hparams()
+        if hp['ds_name'] in ['lj', 'libritts']:
+            inp = {
+                'text': 'the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.'
+            }
+        elif hp['ds_name'] in ['biaobei']:
+            inp = {
+                'text': '如果我想你三遍,天上乌云就散一片。'
+            }
+        infer_ins = cls(hp)
+        out = infer_ins.infer_once(inp)
+        os.makedirs('infer_out', exist_ok=True)
+        save_wav(out, f'infer_out/example_out.wav', hp['audio_sample_rate'])
inference/tts/ds.py
ADDED
@@ -0,0 +1,30 @@
+import torch
+# from inference.tts.fs import FastSpeechInfer
+# from modules.tts.fs2_orig import FastSpeech2Orig
+from inference.tts.base_tts_infer import BaseTTSInfer
+from modules.tts.diffspeech.shallow_diffusion_tts import GaussianDiffusion
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+
+class DiffSpeechInfer(BaseTTSInfer):
+    def build_model(self):
+        dict_size = len(self.ph_encoder)
+        model = GaussianDiffusion(dict_size, self.hparams)
+        model.eval()
+        load_ckpt(model, hparams['work_dir'], 'model')
+        return model
+
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        txt_tokens = sample['txt_tokens']  # [B, T_t]
+        spk_id = sample.get('spk_ids')
+        with torch.no_grad():
+            output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True)
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.cpu().numpy()
+        return wav_out[0]
+
+if __name__ == '__main__':
+    DiffSpeechInfer.example_run()
inference/tts/fs.py
ADDED
@@ -0,0 +1,29 @@
+import torch
+from inference.tts.base_tts_infer import BaseTTSInfer
+from modules.tts.fs import FastSpeech
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+
+class FastSpeechInfer(BaseTTSInfer):
+    def build_model(self):
+        dict_size = len(self.ph_encoder)
+        model = FastSpeech(dict_size, self.hparams)
+        model.eval()
+        load_ckpt(model, hparams['work_dir'], 'model')
+        return model
+
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        txt_tokens = sample['txt_tokens']  # [B, T_t]
+        spk_id = sample.get('spk_ids')
+        with torch.no_grad():
+            output = self.model(txt_tokens, spk_id=spk_id, infer=True)
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.cpu().numpy()
+        return wav_out[0]
+
+
+if __name__ == '__main__':
+    FastSpeechInfer.example_run()
inference/tts/fs2_orig.py
ADDED
@@ -0,0 +1,17 @@
+from inference.tts.fs import FastSpeechInfer
+from modules.tts.fs2_orig import FastSpeech2Orig
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+
+class FastSpeech2OrigInfer(FastSpeechInfer):
+    def build_model(self):
+        dict_size = len(self.ph_encoder)
+        model = FastSpeech2Orig(dict_size, self.hparams)
+        model.eval()
+        load_ckpt(model, hparams['work_dir'], 'model')
+        return model
+
+
+if __name__ == '__main__':
+    FastSpeech2OrigInfer.example_run()
inference/tts/gradio/gradio_settings.yaml
ADDED
@@ -0,0 +1,12 @@
+title: 'yerfor/SyntaSpeech'
+description: |
+  Gradio demo for yerfor/SyntaSpeech. To use it, simply add your audio, or click one of the examples to load them. Note: This space is running on CPU, inference times will be higher.
+article: |
+  Link to <a href='https://github.com/yerfor/SyntaSpeech' style='color:blue;' target='_blank\'>Github REPO</a>
+example_inputs:
+  - |-
+    the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
+  - |-
+    produced the block books, which were the immediate predecessors of the true printed book,
+inference_cls: inference.tts.synta.SyntaSpeechInfer
+exp_name: lj_synta
inference/tts/gradio/infer.py
ADDED
@@ -0,0 +1,69 @@
+import importlib
+import re
+
+import gradio as gr
+import yaml
+from gradio.inputs import Textbox
+
+from inference.tts.base_tts_infer import BaseTTSInfer
+from utils.commons.hparams import set_hparams
+from utils.commons.hparams import hparams as hp
+import numpy as np
+
+from utils.text.text_encoder import PUNCS
+
+
+class GradioInfer:
+    def __init__(self, exp_name, inference_cls, title, description, article, example_inputs):
+        self.exp_name = exp_name
+        self.title = title
+        self.description = description
+        self.article = article
+        self.example_inputs = example_inputs
+        pkg = ".".join(inference_cls.split(".")[:-1])
+        cls_name = inference_cls.split(".")[-1]
+        self.inference_cls = getattr(importlib.import_module(pkg), cls_name)
+
+    def greet(self, text):
+        sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
+        if sents[-1] not in list(PUNCS):
+            sents = sents + ['.']
+        audio_outs = []
+        s = ""
+        for i in range(0, len(sents), 2):
+            if len(sents[i]) > 0:
+                s += sents[i] + sents[i + 1]
+            if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
+                audio_out = self.infer_ins.infer_once({
+                    'text': s
+                })
+                audio_out = audio_out * 32767
+                audio_out = audio_out.astype(np.int16)
+                audio_outs.append(audio_out)
+                audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
+                s = ""
+        audio_outs = np.concatenate(audio_outs)
+        return hp['audio_sample_rate'], audio_outs
+
+    def run(self):
+        set_hparams(exp_name=self.exp_name)
+        infer_cls = self.inference_cls
+        self.infer_ins: BaseTTSInfer = infer_cls(hp)
+        example_inputs = self.example_inputs
+        iface = gr.Interface(fn=self.greet,
+                             inputs=Textbox(
+                                 lines=10, placeholder=None, default=example_inputs[0], label="input text"),
+                             outputs="audio",
+                             allow_flagging="never",
+                             title=self.title,
+                             description=self.description,
+                             article=self.article,
+                             examples=example_inputs,
+                             enable_queue=True)
+        iface.launch(share=True, cache_examples=True)
+
+
+if __name__ == '__main__':
+    gradio_config = yaml.safe_load(open('inference/tts/gradio/gradio_settings.yaml'))
+    g = GradioInfer(**gradio_config)
+    g.run()
inference/tts/ps_flow.py
ADDED
@@ -0,0 +1,39 @@
+import torch
+from inference.tts.base_tts_infer import BaseTTSInfer
+from modules.tts.portaspeech.portaspeech_flow import PortaSpeechFlow
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+
+class PortaSpeechFlowInfer(BaseTTSInfer):
+    def build_model(self):
+        ph_dict_size = len(self.ph_encoder)
+        word_dict_size = len(self.word_encoder)
+        model = PortaSpeechFlow(ph_dict_size, word_dict_size, self.hparams)
+        load_ckpt(model, hparams['work_dir'], 'model')
+        model.to(self.device)
+        with torch.no_grad():
+            model.store_inverse_all()
+        model.eval()
+        return model
+
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        with torch.no_grad():
+            output = self.model(
+                sample['txt_tokens'],
+                sample['word_tokens'],
+                ph2word=sample['ph2word'],
+                word_len=sample['word_lengths'].max(),
+                infer=True,
+                forward_post_glow=True,
+                spk_id=sample.get('spk_ids')
+            )
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.cpu().numpy()
+        return wav_out[0]
+
+
+if __name__ == '__main__':
+    PortaSpeechFlowInfer.example_run()
inference/tts/synta.py
ADDED
@@ -0,0 +1,76 @@
+import torch
+from inference.tts.base_tts_infer import BaseTTSInfer
+from modules.tts.syntaspeech.syntaspeech import SyntaSpeech
+from utils.commons.ckpt_utils import load_ckpt
+from utils.commons.hparams import hparams
+
+from modules.tts.syntaspeech.syntactic_graph_buider import Sentence2GraphParser
+
+class SyntaSpeechInfer(BaseTTSInfer):
+    def __init__(self, hparams, device=None):
+        super().__init__(hparams, device)
+        if hparams['ds_name'] in ['biaobei']:
+            self.syntactic_graph_builder = Sentence2GraphParser(language='zh')
+        elif hparams['ds_name'] in ['ljspeech', 'libritts']:
+            self.syntactic_graph_builder = Sentence2GraphParser(language='en')
+
+    def build_model(self):
+        ph_dict_size = len(self.ph_encoder)
+        word_dict_size = len(self.word_encoder)
+        model = SyntaSpeech(ph_dict_size, word_dict_size, self.hparams)
+        load_ckpt(model, hparams['work_dir'], 'model')
+        model.to(self.device)
+        with torch.no_grad():
+            model.store_inverse_all()
+        model.eval()
+        return model
+
+    def input_to_batch(self, item):
+        item_names = [item['item_name']]
+        text = [item['text']]
+        ph = [item['ph']]
+        txt_tokens = torch.LongTensor(item['ph_token'])[None, :].to(self.device)
+        txt_lengths = torch.LongTensor([txt_tokens.shape[1]]).to(self.device)
+        word_tokens = torch.LongTensor(item['word_token'])[None, :].to(self.device)
+        word_lengths = torch.LongTensor([word_tokens.shape[1]]).to(self.device)
+        ph2word = torch.LongTensor(item['ph2word'])[None, :].to(self.device)
+        spk_ids = torch.LongTensor(item['spk_id'])[None, :].to(self.device)
+        dgl_graph, etypes = self.syntactic_graph_builder.parse(item['text'], words=item['words'].split(" "), ph_words=item['ph_words'].split(" "))
+        dgl_graph = dgl_graph.to(self.device)
+        etypes = etypes.to(self.device)
+        batch = {
+            'item_name': item_names,
+            'text': text,
+            'ph': ph,
+            'txt_tokens': txt_tokens,
+            'txt_lengths': txt_lengths,
+            'word_tokens': word_tokens,
+            'word_lengths': word_lengths,
+            'ph2word': ph2word,
+            'spk_ids': spk_ids,
+            'graph_lst': [dgl_graph],
+            'etypes_lst': [etypes]
+        }
+        return batch
+
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        with torch.no_grad():
+            output = self.model(
+                sample['txt_tokens'],
+                sample['word_tokens'],
+                ph2word=sample['ph2word'],
+                word_len=sample['word_lengths'].max(),
+                infer=True,
+                forward_post_glow=True,
+                spk_id=sample.get('spk_ids'),
+                graph_lst=sample['graph_lst'],
+                etypes_lst=sample['etypes_lst']
+            )
+            mel_out = output['mel_out']
+            wav_out = self.run_vocoder(mel_out)
+        wav_out = wav_out.cpu().numpy()
+        return wav_out[0]
+
+
+if __name__ == '__main__':
+    SyntaSpeechInfer.example_run()
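Note: each inference script above exposes an example_run() entry point, and GradioInfer.run() shows that set_hparams can be called with an exp_name. Assuming that pattern and the lj_synta checkpoint referenced in gradio_settings.yaml, a minimal standalone driver might look like the sketch below; it is an illustration, not a workflow confirmed by this diff.

    # hypothetical driver: assumes checkpoints/lj_synta holds the experiment's config and model checkpoint
    from utils.commons.hparams import set_hparams, hparams as hp
    from inference.tts.synta import SyntaSpeechInfer
    from utils.audio.io import save_wav

    set_hparams(exp_name='lj_synta')   # assumption: exp_name keyword, as used in GradioInfer.run()
    infer_ins = SyntaSpeechInfer(hp)
    wav = infer_ins.infer_once({'text': 'SyntaSpeech synthesizes speech from raw text.'})
    save_wav(wav, 'infer_out/demo.wav', hp['audio_sample_rate'])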
modules/commons/__pycache__/conv.cpython-36.pyc
ADDED
Binary file (6.54 kB)