Update models
Browse files- asr.ckpt +2 -2
- hyperparams.yaml +16 -19
- lm.ckpt +2 -2
- normalizer.ckpt +2 -2
- tokenizer.ckpt +2 -2
asr.ckpt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8e718dc29b403dfaa8d2604c43c3666be3fa99e958b77e3c6ff387e94d4a174c
|
3 |
+
size 184546287
|
hyperparams.yaml
CHANGED
@@ -29,7 +29,6 @@ vocab_size: 5000
|
|
29 |
|
30 |
# Outputs
|
31 |
blank_index: 0
|
32 |
-
label_smoothing: 0.1
|
33 |
pad_index: 0
|
34 |
bos_index: 1
|
35 |
eos_index: 2
|
@@ -38,10 +37,8 @@ unk_index: 0
|
|
38 |
# Decoding parameters
|
39 |
min_decode_ratio: 0.0
|
40 |
max_decode_ratio: 1.0
|
41 |
-
|
42 |
-
|
43 |
-
test_beam_size: 60
|
44 |
-
lm_weight: 0.20
|
45 |
ctc_weight_decode: 0.40
|
46 |
|
47 |
############################## models ################################
|
@@ -51,15 +48,15 @@ normalizer: !new:speechbrain.processing.features.InputNormalization
|
|
51 |
|
52 |
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
|
53 |
input_shape: (8, 10, 80)
|
54 |
-
num_blocks:
|
55 |
num_layers_per_block: 1
|
56 |
-
out_channels: (64,
|
57 |
-
kernel_sizes: (
|
58 |
-
strides: (2, 2)
|
59 |
-
residuals: (False, False)
|
60 |
-
|
61 |
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
|
62 |
-
input_size:
|
63 |
tgt_vocab: !ref <output_neurons>
|
64 |
d_model: !ref <d_model>
|
65 |
nhead: !ref <nhead>
|
@@ -106,11 +103,14 @@ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
|
|
106 |
ctc_weight: !ref <ctc_weight_decode>
|
107 |
lm_weight: !ref <lm_weight>
|
108 |
lm_modules: !ref <lm_model>
|
109 |
-
temperature: 1.
|
110 |
-
temperature_lm: 1.
|
111 |
using_eos_threshold: False
|
112 |
length_normalization: True
|
113 |
|
|
|
|
|
|
|
114 |
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
|
115 |
transformer: !ref <Transformer>
|
116 |
|
@@ -122,11 +122,7 @@ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
|
|
122 |
transformer_encoder: !ref <Tencoder>
|
123 |
|
124 |
asr_model: !new:torch.nn.ModuleList
|
125 |
-
- [!ref <
|
126 |
-
|
127 |
-
log_softmax: !new:torch.nn.LogSoftmax
|
128 |
-
dim: -1
|
129 |
-
|
130 |
|
131 |
compute_features: !new:speechbrain.lobes.features.Fbank
|
132 |
sample_rate: !ref <sample_rate>
|
@@ -142,6 +138,7 @@ modules:
|
|
142 |
lm_model: !ref <lm_model>
|
143 |
encoder: !ref <encoder>
|
144 |
decoder: !ref <decoder>
|
|
|
145 |
# The pretrainer allows a mapping between pretrained files and instances that
|
146 |
# are declared in the yaml.
|
147 |
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
|
|
29 |
|
30 |
# Outputs
|
31 |
blank_index: 0
|
|
|
32 |
pad_index: 0
|
33 |
bos_index: 1
|
34 |
eos_index: 2
|
|
|
37 |
# Decoding parameters
|
38 |
min_decode_ratio: 0.0
|
39 |
max_decode_ratio: 1.0
|
40 |
+
test_beam_size: 10
|
41 |
+
lm_weight: 0.0
|
|
|
|
|
42 |
ctc_weight_decode: 0.40
|
43 |
|
44 |
############################## models ################################
|
|
|
48 |
|
49 |
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
|
50 |
input_shape: (8, 10, 80)
|
51 |
+
num_blocks: 3
|
52 |
num_layers_per_block: 1
|
53 |
+
out_channels: (64, 64, 64)
|
54 |
+
kernel_sizes: (5, 5, 1)
|
55 |
+
strides: (2, 2, 1)
|
56 |
+
residuals: (False, False, True)
|
57 |
+
|
58 |
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
|
59 |
+
input_size: 1280
|
60 |
tgt_vocab: !ref <output_neurons>
|
61 |
d_model: !ref <d_model>
|
62 |
nhead: !ref <nhead>
|
|
|
103 |
ctc_weight: !ref <ctc_weight_decode>
|
104 |
lm_weight: !ref <lm_weight>
|
105 |
lm_modules: !ref <lm_model>
|
106 |
+
temperature: 1.30
|
107 |
+
temperature_lm: 1.30
|
108 |
using_eos_threshold: False
|
109 |
length_normalization: True
|
110 |
|
111 |
+
log_softmax: !new:torch.nn.LogSoftmax
|
112 |
+
dim: -1
|
113 |
+
|
114 |
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
|
115 |
transformer: !ref <Transformer>
|
116 |
|
|
|
122 |
transformer_encoder: !ref <Tencoder>
|
123 |
|
124 |
asr_model: !new:torch.nn.ModuleList
|
125 |
+
- [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
|
|
|
|
|
|
|
|
|
126 |
|
127 |
compute_features: !new:speechbrain.lobes.features.Fbank
|
128 |
sample_rate: !ref <sample_rate>
|
|
|
138 |
lm_model: !ref <lm_model>
|
139 |
encoder: !ref <encoder>
|
140 |
decoder: !ref <decoder>
|
141 |
+
|
142 |
# The pretrainer allows a mapping between pretrained files and instances that
|
143 |
# are declared in the yaml.
|
144 |
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
|
lm.ckpt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f0b49d5e1f9894c0c9f2ec21c8658da8e1a07f509b807e8624450ba19ea667c
|
3 |
+
size 381072461
|
normalizer.ckpt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1da2ced935d955c014177591249e5db497d0c5dc7143e64378da0cb5590fe77a
|
3 |
+
size 1703
|
tokenizer.ckpt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d419e55734c26df6c5690671be2b887a7db389c1a7f63286111ce737508c6569
|
3 |
+
size 313900
|