yangwang825 committed on
Commit
c3da416
·
verified ·
1 Parent(s): 7f883e2

Upload EcapaTdnnForSequenceClassification

Browse files
Files changed (3) hide show
  1. config.json +6 -2
  2. model.safetensors +1 -1
  3. tdnn_attention.py +12 -7
config.json CHANGED
@@ -1,11 +1,14 @@
1
  {
2
- "_attn_implementation_autoset": true,
3
  "angular": true,
4
  "angular_margin": 0.2,
5
  "angular_scale": 30,
 
 
 
6
  "attention_channels": 128,
7
  "auto_map": {
8
- "AutoConfig": "configuration_ecapa_tdnn.EcapaTdnnConfig"
 
9
  },
10
  "bos_token_id": 1,
11
  "decoder_config": {
@@ -2577,6 +2580,7 @@
2577
  },
2578
  "time_masks": 5,
2579
  "time_width": 0.03,
 
2580
  "transformers_version": "4.48.3",
2581
  "use_torchaudio": true,
2582
  "use_vectorized_spec_augment": true,
 
1
  {
 
2
  "angular": true,
3
  "angular_margin": 0.2,
4
  "angular_scale": 30,
5
+ "architectures": [
6
+ "EcapaTdnnForSequenceClassification"
7
+ ],
8
  "attention_channels": 128,
9
  "auto_map": {
10
+ "AutoConfig": "configuration_ecapa_tdnn.EcapaTdnnConfig",
11
+ "AutoModelForAudioClassification": "modeling_ecapa_tdnn.EcapaTdnnForSequenceClassification"
12
  },
13
  "bos_token_id": 1,
14
  "decoder_config": {
 
2580
  },
2581
  "time_masks": 5,
2582
  "time_width": 0.03,
2583
+ "torch_dtype": "float32",
2584
  "transformers_version": "4.48.3",
2585
  "use_torchaudio": true,
2586
  "use_vectorized_spec_augment": true,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed569fa8302895e8669d9e8139955f604b34f042b9eb9002ea462953c0f48d4a
3
  size 26039912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4da89b0b6d405974f1e332bdc9945fae76222d7ddf0f955653fba9a00cca0339
3
  size 26039912
tdnn_attention.py CHANGED
@@ -214,16 +214,21 @@ class MaskedSEModule(nn.Module):
214
  nn.Sigmoid(),
215
  )
216
 
217
- def forward(self, input, length=None):
 
 
 
 
218
  if length is None:
219
- x = torch.mean(input, dim=2, keep_dim=True)
220
  else:
221
- max_len = input.size(2)
222
- mask, num_values = lens_to_mask(length, max_len=max_len, device=input.device)
223
- x = torch.sum((input * mask), dim=2, keepdim=True) / (num_values)
224
-
 
225
  out = self.se_layer(x)
226
- return out * input
227
 
228
 
229
  class TdnnSeModule(nn.Module):
 
214
  nn.Sigmoid(),
215
  )
216
 
217
+ def forward(self, inputs, length=None):
218
+ """
219
+ inputs: tensor shape of (B, D, T)
220
+ outputs: tensor shape of (B, D, 1)
221
+ """
222
  if length is None:
223
+ x = torch.mean(inputs, dim=2, keep_dim=True)
224
  else:
225
+ max_len = inputs.size(2)
226
+ # shape of `mask` is (B, 1, T) and shape of `num_values` is (B, 1, 1)
227
+ mask, num_values = lens_to_mask(length, max_len=max_len, device=inputs.device)
228
+ # shape of `x` is (B, D, 1)
229
+ x = torch.sum((inputs * mask), dim=2, keepdim=True) / (num_values)
230
  out = self.se_layer(x)
231
+ return out * inputs
232
 
233
 
234
  class TdnnSeModule(nn.Module):