AbeShinzo0708 committed on
Commit
49193db
1 Parent(s): 1fbd54a

Upload 12 files

Browse files
100epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b393b091ed1da1ef2bd4ec1d067aecc94ed7a19b4c00d4ea16d0d3ff4d233471
3
+ size 373275392
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: AI SugaYoshihide Speaker
3
- emoji: 🌍
4
- colorFrom: purple
5
- colorTo: red
6
  sdk: streamlit
7
- sdk_version: 1.27.1
8
  app_file: app.py
9
  pinned: false
10
  license: openrail
 
1
  ---
2
+ title: AbeShinzo TTS
3
+ emoji: 🔥
4
+ colorFrom: indigo
5
+ colorTo: gray
6
  sdk: streamlit
7
+ sdk_version: 1.27.0
8
  app_file: app.py
9
  pinned: false
10
  license: openrail
abe.jpg ADDED
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import streamlit as st
4
+ import numpy as np
5
+ import torch
6
+ from espnet2.bin.tts_inference import Text2Speech
7
+ from scipy.io.wavfile import write
8
+ from PIL import Image
9
+
10
+
11
+ fs, lang = 44100, "Japanese"
12
+ model= "./100epoch.pth"
13
+ x = "これはテストメッセージです"
14
+
15
+ text2speech = Text2Speech.from_pretrained(
16
+ model_file=model,
17
+ device="cpu",
18
+ speed_control_alpha=1.0,
19
+ noise_scale=0.333,
20
+ noise_scale_dur=0.333,
21
+ )
22
+ pause = np.zeros(30000, dtype=np.float32)
23
+
24
+ st.title("おしゃべりAI菅義偉メーカー")
25
+ image = Image.open('suga.jpg')
26
+ st.image(image)
27
+ text = st.text_area(label='ここにテキストを入力 (Input Text)↓', height=100, max_chars=2048)
28
+
29
+
30
+ if st.button("生成(Generate)"):
31
+ with torch.no_grad():
32
+ wav = text2speech(text)["wav"]
33
+
34
+ wav_list = []
35
+ wav_list.append(np.concatenate([wav.view(-1).cpu().numpy(), pause]))
36
+ final_wav = np.concatenate(wav_list)
37
+ st.audio(final_wav, sample_rate=fs)
config.yaml ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/finetune_full_band_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_full_band_vits
7
+ ngpu: 1
8
+ seed: 777
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param:
63
+ - downloads/full_band_vits_accent_with_pause_pretrain/exp/tts_train_full_band_vits_raw_phn_jaconv_pyopenjtalk_accent_with_pause/train.total_count.ave_10best.pth:tts:tts
64
+ ignore_init_mismatch: false
65
+ freeze_param: []
66
+ num_iters_per_epoch: 1000
67
+ batch_size: 20
68
+ valid_batch_size: null
69
+ batch_bins: 100000
70
+ valid_batch_bins: null
71
+ train_shape_file:
72
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/text_shape.phn
73
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/train/speech_shape
74
+ valid_shape_file:
75
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/text_shape.phn
76
+ - exp/tts_stats_raw_linear_spectrogram_phn_jaconv_pyopenjtalk_accent_with_pause/valid/speech_shape
77
+ batch_type: numel
78
+ valid_batch_type: null
79
+ fold_length:
80
+ - 150
81
+ - 409600
82
+ sort_in_batch: descending
83
+ sort_batch: descending
84
+ multiple_iterator: false
85
+ chunk_length: 500
86
+ chunk_shift_ratio: 0.5
87
+ num_cache_chunks: 1024
88
+ chunk_excluded_key_prefixes: []
89
+ train_data_path_and_name_and_type:
90
+ - - dump/44k/raw/tr_no_dev/text
91
+ - text
92
+ - text
93
+ - - dump/44k/raw/tr_no_dev/wav.scp
94
+ - speech
95
+ - sound
96
+ valid_data_path_and_name_and_type:
97
+ - - dump/44k/raw/dev/text
98
+ - text
99
+ - text
100
+ - - dump/44k/raw/dev/wav.scp
101
+ - speech
102
+ - sound
103
+ allow_variable_data_keys: false
104
+ max_cache_size: 0.0
105
+ max_cache_fd: 32
106
+ valid_max_cache_size: null
107
+ exclude_weight_decay: false
108
+ exclude_weight_decay_conf: {}
109
+ optim: adamw
110
+ optim_conf:
111
+ lr: 0.0001
112
+ betas:
113
+ - 0.8
114
+ - 0.99
115
+ eps: 1.0e-09
116
+ weight_decay: 0.0
117
+ scheduler: exponentiallr
118
+ scheduler_conf:
119
+ gamma: 0.999875
120
+ optim2: adamw
121
+ optim2_conf:
122
+ lr: 0.0001
123
+ betas:
124
+ - 0.8
125
+ - 0.99
126
+ eps: 1.0e-09
127
+ weight_decay: 0.0
128
+ scheduler2: exponentiallr
129
+ scheduler2_conf:
130
+ gamma: 0.999875
131
+ generator_first: false
132
+ token_list:
133
+ - <blank>
134
+ - <unk>
135
+ - '1'
136
+ - '2'
137
+ - '0'
138
+ - '3'
139
+ - '4'
140
+ - '-1'
141
+ - '5'
142
+ - a
143
+ - o
144
+ - '-2'
145
+ - i
146
+ - '-3'
147
+ - u
148
+ - e
149
+ - k
150
+ - n
151
+ - t
152
+ - '6'
153
+ - r
154
+ - '-4'
155
+ - s
156
+ - N
157
+ - m
158
+ - pau
159
+ - '7'
160
+ - sh
161
+ - d
162
+ - g
163
+ - w
164
+ - '8'
165
+ - U
166
+ - '-5'
167
+ - I
168
+ - cl
169
+ - h
170
+ - y
171
+ - b
172
+ - '9'
173
+ - j
174
+ - ts
175
+ - ch
176
+ - '-6'
177
+ - z
178
+ - p
179
+ - '-7'
180
+ - f
181
+ - ky
182
+ - ry
183
+ - '-8'
184
+ - gy
185
+ - '-9'
186
+ - hy
187
+ - ny
188
+ - '-10'
189
+ - by
190
+ - my
191
+ - '-11'
192
+ - '-12'
193
+ - '-13'
194
+ - py
195
+ - '-14'
196
+ - '-15'
197
+ - v
198
+ - '10'
199
+ - '-16'
200
+ - '-17'
201
+ - '11'
202
+ - '-21'
203
+ - '-20'
204
+ - '12'
205
+ - '-19'
206
+ - '13'
207
+ - '-18'
208
+ - '14'
209
+ - dy
210
+ - '15'
211
+ - ty
212
+ - '-22'
213
+ - '16'
214
+ - '18'
215
+ - '19'
216
+ - '17'
217
+ - <sos/eos>
218
+ odim: null
219
+ model_conf: {}
220
+ use_preprocessor: true
221
+ token_type: phn
222
+ bpemodel: null
223
+ non_linguistic_symbols: null
224
+ cleaner: jaconv
225
+ g2p: pyopenjtalk_accent_with_pause
226
+ feats_extract: linear_spectrogram
227
+ feats_extract_conf:
228
+ n_fft: 2048
229
+ hop_length: 512
230
+ win_length: null
231
+ normalize: null
232
+ normalize_conf: {}
233
+ tts: vits
234
+ tts_conf:
235
+ generator_type: vits_generator
236
+ generator_params:
237
+ hidden_channels: 192
238
+ spks: -1
239
+ global_channels: -1
240
+ segment_size: 32
241
+ text_encoder_attention_heads: 2
242
+ text_encoder_ffn_expand: 4
243
+ text_encoder_blocks: 6
244
+ text_encoder_positionwise_layer_type: conv1d
245
+ text_encoder_positionwise_conv_kernel_size: 3
246
+ text_encoder_positional_encoding_layer_type: rel_pos
247
+ text_encoder_self_attention_layer_type: rel_selfattn
248
+ text_encoder_activation_type: swish
249
+ text_encoder_normalize_before: true
250
+ text_encoder_dropout_rate: 0.1
251
+ text_encoder_positional_dropout_rate: 0.0
252
+ text_encoder_attention_dropout_rate: 0.1
253
+ use_macaron_style_in_text_encoder: true
254
+ use_conformer_conv_in_text_encoder: false
255
+ text_encoder_conformer_kernel_size: -1
256
+ decoder_kernel_size: 7
257
+ decoder_channels: 512
258
+ decoder_upsample_scales:
259
+ - 8
260
+ - 8
261
+ - 2
262
+ - 2
263
+ - 2
264
+ decoder_upsample_kernel_sizes:
265
+ - 16
266
+ - 16
267
+ - 4
268
+ - 4
269
+ - 4
270
+ decoder_resblock_kernel_sizes:
271
+ - 3
272
+ - 7
273
+ - 11
274
+ decoder_resblock_dilations:
275
+ - - 1
276
+ - 3
277
+ - 5
278
+ - - 1
279
+ - 3
280
+ - 5
281
+ - - 1
282
+ - 3
283
+ - 5
284
+ use_weight_norm_in_decoder: true
285
+ posterior_encoder_kernel_size: 5
286
+ posterior_encoder_layers: 16
287
+ posterior_encoder_stacks: 1
288
+ posterior_encoder_base_dilation: 1
289
+ posterior_encoder_dropout_rate: 0.0
290
+ use_weight_norm_in_posterior_encoder: true
291
+ flow_flows: 4
292
+ flow_kernel_size: 5
293
+ flow_base_dilation: 1
294
+ flow_layers: 4
295
+ flow_dropout_rate: 0.0
296
+ use_weight_norm_in_flow: true
297
+ use_only_mean_in_flow: true
298
+ stochastic_duration_predictor_kernel_size: 3
299
+ stochastic_duration_predictor_dropout_rate: 0.5
300
+ stochastic_duration_predictor_flows: 4
301
+ stochastic_duration_predictor_dds_conv_layers: 3
302
+ vocabs: 85
303
+ aux_channels: 1025
304
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
305
+ discriminator_params:
306
+ scales: 1
307
+ scale_downsample_pooling: AvgPool1d
308
+ scale_downsample_pooling_params:
309
+ kernel_size: 4
310
+ stride: 2
311
+ padding: 2
312
+ scale_discriminator_params:
313
+ in_channels: 1
314
+ out_channels: 1
315
+ kernel_sizes:
316
+ - 15
317
+ - 41
318
+ - 5
319
+ - 3
320
+ channels: 128
321
+ max_downsample_channels: 1024
322
+ max_groups: 16
323
+ bias: true
324
+ downsample_scales:
325
+ - 2
326
+ - 2
327
+ - 4
328
+ - 4
329
+ - 1
330
+ nonlinear_activation: LeakyReLU
331
+ nonlinear_activation_params:
332
+ negative_slope: 0.1
333
+ use_weight_norm: true
334
+ use_spectral_norm: false
335
+ follow_official_norm: false
336
+ periods:
337
+ - 2
338
+ - 3
339
+ - 5
340
+ - 7
341
+ - 11
342
+ period_discriminator_params:
343
+ in_channels: 1
344
+ out_channels: 1
345
+ kernel_sizes:
346
+ - 5
347
+ - 3
348
+ channels: 32
349
+ downsample_scales:
350
+ - 3
351
+ - 3
352
+ - 3
353
+ - 3
354
+ - 1
355
+ max_downsample_channels: 1024
356
+ bias: true
357
+ nonlinear_activation: LeakyReLU
358
+ nonlinear_activation_params:
359
+ negative_slope: 0.1
360
+ use_weight_norm: true
361
+ use_spectral_norm: false
362
+ generator_adv_loss_params:
363
+ average_by_discriminators: false
364
+ loss_type: mse
365
+ discriminator_adv_loss_params:
366
+ average_by_discriminators: false
367
+ loss_type: mse
368
+ feat_match_loss_params:
369
+ average_by_discriminators: false
370
+ average_by_layers: false
371
+ include_final_outputs: true
372
+ mel_loss_params:
373
+ fs: 44100
374
+ n_fft: 2048
375
+ hop_length: 512
376
+ win_length: null
377
+ window: hann
378
+ n_mels: 80
379
+ fmin: 0
380
+ fmax: null
381
+ log_base: null
382
+ lambda_adv: 1.0
383
+ lambda_mel: 45.0
384
+ lambda_feat_match: 2.0
385
+ lambda_dur: 1.0
386
+ lambda_kl: 1.0
387
+ sampling_rate: 44100
388
+ cache_generator_outputs: true
389
+ pitch_extract: null
390
+ pitch_extract_conf: {}
391
+ pitch_normalize: null
392
+ pitch_normalize_conf: {}
393
+ energy_extract: null
394
+ energy_extract_conf: {}
395
+ energy_normalize: null
396
+ energy_normalize_conf: {}
397
+ required:
398
+ - output_dir
399
+ - token_list
400
+ version: '202301'
401
+ distributed: false
hooks/hook-espnet.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from PyInstaller.utils.hooks import copy_metadata
2
+
3
+ datas = copy_metadata('espnet')
hooks/hook-jamo.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from PyInstaller.utils.hooks import copy_metadata
2
+
3
+ datas = copy_metadata('jamo')
hooks/hook-librosa.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from PyInstaller.utils.hooks import copy_metadata
2
+
3
+ datas = copy_metadata('librosa')
hooks/hook-streamlit.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from PyInstaller.utils.hooks import copy_metadata
2
+
3
+ datas = copy_metadata('streamlit')
pre-fix/librosa/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/librosa/librosa/issues/1682
2
+
3
+ import lazy_loader as lazy
4
+ from .version import version as __version__
5
+
6
+ _filename = __file__
7
+ if _filename.endswith('.pyc'):
8
+ _filename = _filename[:-1]
9
+
10
+ __getattr__, __dir__, __all__ = lazy.attach_stub(__name__, _filename)
requirements.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+
3
+ # Install CPU version of pytorch
4
+ --extra-index-url https://download.pytorch.org/whl/cpu
5
+ torch
6
+
7
+ soundfile
8
+ espnet
9
+ espnet_model_zoo
10
+
11
+ # pyopenjtalk version must be 0.2
12
+ pyopenjtalk-prebuilt==0.2.0
13
+
14
+ # typeguard must be pinned to 2.13.3 (the latest version in which Python 3.8 is supported)
15
+ typeguard==2.13.3
16
+
17
+ # Use matplotlib < 3.7.0 as a workaround; otherwise PyInstaller fails to install some DLLs
18
+ # https://github.com/pyinstaller/pyinstaller/pull/7505
19
+ # To visualize audio data
20
+ matplotlib<3.7.0
suga.jpg ADDED