duwuonline committed on
Commit
d4d67a9
1 Parent(s): 155c929

Training in progress, step 10

Browse files
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "facebook/mms-tts-vie",
3
  "activation_dropout": 0.1,
4
  "apply_spec_augment": true,
5
  "architectures": [
@@ -40,87 +40,43 @@
40
  "decoder_layerdrop": 0.1,
41
  "decoder_layers": 6,
42
  "decoder_start_token_id": 2,
43
- "depth_separable_channels": 2,
44
- "depth_separable_num_layers": 3,
45
- "duration_predictor_dropout": 0.5,
46
- "duration_predictor_filter_channels": 256,
47
- "duration_predictor_flow_bins": 10,
48
- "duration_predictor_kernel_size": 3,
49
- "duration_predictor_num_flows": 4,
50
- "duration_predictor_tail_bound": 5.0,
51
- "encoder_attention_heads": 2,
52
  "encoder_ffn_dim": 3072,
53
  "encoder_layerdrop": 0.1,
54
- "encoder_layers": 6,
55
  "encoder_max_relative_position": 160,
56
  "eos_token_id": 2,
57
  "feat_extract_activation": "gelu",
58
  "feat_extract_norm": "group",
59
  "feat_proj_dropout": 0.0,
60
- "ffn_dim": 768,
61
- "ffn_kernel_size": 3,
62
- "flow_size": 192,
63
  "guided_attention_loss_num_heads": 2,
64
  "guided_attention_loss_scale": 10.0,
65
  "guided_attention_loss_sigma": 0.4,
66
- "hidden_act": "relu",
67
  "hidden_dropout": 0.1,
68
- "hidden_size": 192,
69
  "initializer_range": 0.02,
70
  "is_encoder_decoder": true,
71
  "layer_norm_eps": 1e-05,
72
- "layerdrop": 0.1,
73
- "leaky_relu_slope": 0.1,
74
  "mask_feature_length": 10,
75
  "mask_feature_min_masks": 0,
76
  "mask_feature_prob": 0.0,
77
  "mask_time_length": 10,
78
  "mask_time_min_masks": 2,
79
  "mask_time_prob": 0.05,
80
- "max_speech_positions": 4000,
81
- "max_text_positions": 450,
 
82
  "model_type": "speecht5",
83
- "noise_scale": 0.667,
84
- "noise_scale_duration": 0.8,
85
  "num_conv_pos_embedding_groups": 16,
86
  "num_conv_pos_embeddings": 128,
87
  "num_feat_extract_layers": 7,
88
  "num_mel_bins": 80,
89
- "num_speakers": 1,
90
  "pad_token_id": 1,
91
  "positional_dropout": 0.1,
92
- "posterior_encoder_num_wavenet_layers": 16,
93
- "prior_encoder_num_flows": 4,
94
- "prior_encoder_num_wavenet_layers": 4,
95
  "reduction_factor": 2,
96
- "resblock_dilation_sizes": [
97
- [
98
- 1,
99
- 3,
100
- 5
101
- ],
102
- [
103
- 1,
104
- 3,
105
- 5
106
- ],
107
- [
108
- 1,
109
- 3,
110
- 5
111
- ]
112
- ],
113
- "resblock_kernel_sizes": [
114
- 3,
115
- 7,
116
- 11
117
- ],
118
- "sampling_rate": 16000,
119
  "scale_embedding": false,
120
  "speaker_embedding_dim": 512,
121
- "speaker_embedding_size": 0,
122
- "speaking_rate": 1.0,
123
- "spectrogram_bins": 513,
124
  "speech_decoder_postnet_dropout": 0.5,
125
  "speech_decoder_postnet_kernel": 5,
126
  "speech_decoder_postnet_layers": 5,
@@ -130,26 +86,7 @@
130
  "speech_decoder_prenet_units": 256,
131
  "torch_dtype": "float32",
132
  "transformers_version": "4.35.2",
133
- "upsample_initial_channel": 512,
134
- "upsample_kernel_sizes": [
135
- 16,
136
- 16,
137
- 4,
138
- 4
139
- ],
140
- "upsample_rates": [
141
- 8,
142
- 8,
143
- 2,
144
- 2
145
- ],
146
- "use_bias": true,
147
  "use_cache": false,
148
  "use_guided_attention_loss": true,
149
- "use_stochastic_duration_prediction": true,
150
- "vocab_size": 95,
151
- "wavenet_dilation_rate": 1,
152
- "wavenet_dropout": 0.0,
153
- "wavenet_kernel_size": 5,
154
- "window_size": 4
155
  }
 
1
  {
2
+ "_name_or_path": "microsoft/speecht5_tts",
3
  "activation_dropout": 0.1,
4
  "apply_spec_augment": true,
5
  "architectures": [
 
40
  "decoder_layerdrop": 0.1,
41
  "decoder_layers": 6,
42
  "decoder_start_token_id": 2,
43
+ "encoder_attention_heads": 12,
 
 
 
 
 
 
 
 
44
  "encoder_ffn_dim": 3072,
45
  "encoder_layerdrop": 0.1,
46
+ "encoder_layers": 12,
47
  "encoder_max_relative_position": 160,
48
  "eos_token_id": 2,
49
  "feat_extract_activation": "gelu",
50
  "feat_extract_norm": "group",
51
  "feat_proj_dropout": 0.0,
 
 
 
52
  "guided_attention_loss_num_heads": 2,
53
  "guided_attention_loss_scale": 10.0,
54
  "guided_attention_loss_sigma": 0.4,
55
+ "hidden_act": "gelu",
56
  "hidden_dropout": 0.1,
57
+ "hidden_size": 768,
58
  "initializer_range": 0.02,
59
  "is_encoder_decoder": true,
60
  "layer_norm_eps": 1e-05,
 
 
61
  "mask_feature_length": 10,
62
  "mask_feature_min_masks": 0,
63
  "mask_feature_prob": 0.0,
64
  "mask_time_length": 10,
65
  "mask_time_min_masks": 2,
66
  "mask_time_prob": 0.05,
67
+ "max_length": 1876,
68
+ "max_speech_positions": 1876,
69
+ "max_text_positions": 600,
70
  "model_type": "speecht5",
 
 
71
  "num_conv_pos_embedding_groups": 16,
72
  "num_conv_pos_embeddings": 128,
73
  "num_feat_extract_layers": 7,
74
  "num_mel_bins": 80,
 
75
  "pad_token_id": 1,
76
  "positional_dropout": 0.1,
 
 
 
77
  "reduction_factor": 2,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  "scale_embedding": false,
79
  "speaker_embedding_dim": 512,
 
 
 
80
  "speech_decoder_postnet_dropout": 0.5,
81
  "speech_decoder_postnet_kernel": 5,
82
  "speech_decoder_postnet_layers": 5,
 
86
  "speech_decoder_prenet_units": 256,
87
  "torch_dtype": "float32",
88
  "transformers_version": "4.35.2",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  "use_cache": false,
90
  "use_guided_attention_loss": true,
91
+ "vocab_size": 81
 
 
 
 
 
92
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5640bab1fd7f591bae9eb5d3a5106575e81f9de02d50ceed94227e450a198328
3
- size 73714424
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e9a8ed04ba0fc2d1c16e528ae034a4371e416cff28e5ed277afcb1ccf82283b
3
+ size 577789320
runs/Jan18_11-52-35_71565f425e09/events.out.tfevents.1705578756.71565f425e09.1122.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63e30dd6de25685876d87e9f680b3b91a8870d850d8e2b87f3e62e41833d9019
3
+ size 6917
tokenizer_config.json CHANGED
@@ -59,5 +59,6 @@
59
  "processor_class": "SpeechT5Processor",
60
  "sp_model_kwargs": {},
61
  "tokenizer_class": "SpeechT5Tokenizer",
 
62
  "unk_token": "<unk>"
63
  }
 
59
  "processor_class": "SpeechT5Processor",
60
  "sp_model_kwargs": {},
61
  "tokenizer_class": "SpeechT5Tokenizer",
62
+ "trust_remote_code": false,
63
  "unk_token": "<unk>"
64
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d37d204bab59f02ac528e37c736f97443264a17423f3649029183d83011b5cc2
3
  size 4792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3545375cd1e95128155f1f4cc6c3d17df840b2a952535b7cd8b2ef2714a0550
3
  size 4792