Upload EcapaTdnnForSequenceClassification
Browse files- config.json +6 -2
- model.safetensors +1 -1
- tdnn_attention.py +12 -7
config.json
CHANGED
@@ -1,11 +1,14 @@
|
|
1 |
{
|
2 |
-
"_attn_implementation_autoset": true,
|
3 |
"angular": true,
|
4 |
"angular_margin": 0.2,
|
5 |
"angular_scale": 30,
|
|
|
|
|
|
|
6 |
"attention_channels": 128,
|
7 |
"auto_map": {
|
8 |
-
"AutoConfig": "configuration_ecapa_tdnn.EcapaTdnnConfig"
|
|
|
9 |
},
|
10 |
"bos_token_id": 1,
|
11 |
"decoder_config": {
|
@@ -2577,6 +2580,7 @@
|
|
2577 |
},
|
2578 |
"time_masks": 5,
|
2579 |
"time_width": 0.03,
|
|
|
2580 |
"transformers_version": "4.48.3",
|
2581 |
"use_torchaudio": true,
|
2582 |
"use_vectorized_spec_augment": true,
|
|
|
1 |
{
|
|
|
2 |
"angular": true,
|
3 |
"angular_margin": 0.2,
|
4 |
"angular_scale": 30,
|
5 |
+
"architectures": [
|
6 |
+
"EcapaTdnnForSequenceClassification"
|
7 |
+
],
|
8 |
"attention_channels": 128,
|
9 |
"auto_map": {
|
10 |
+
"AutoConfig": "configuration_ecapa_tdnn.EcapaTdnnConfig",
|
11 |
+
"AutoModelForAudioClassification": "modeling_ecapa_tdnn.EcapaTdnnForSequenceClassification"
|
12 |
},
|
13 |
"bos_token_id": 1,
|
14 |
"decoder_config": {
|
|
|
2580 |
},
|
2581 |
"time_masks": 5,
|
2582 |
"time_width": 0.03,
|
2583 |
+
"torch_dtype": "float32",
|
2584 |
"transformers_version": "4.48.3",
|
2585 |
"use_torchaudio": true,
|
2586 |
"use_vectorized_spec_augment": true,
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 26039912
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4da89b0b6d405974f1e332bdc9945fae76222d7ddf0f955653fba9a00cca0339
|
3 |
size 26039912
|
tdnn_attention.py
CHANGED
@@ -214,16 +214,21 @@ class MaskedSEModule(nn.Module):
|
|
214 |
nn.Sigmoid(),
|
215 |
)
|
216 |
|
217 |
-
def forward(self,
|
|
|
|
|
|
|
|
|
218 |
if length is None:
|
219 |
-
x = torch.mean(
|
220 |
else:
|
221 |
-
max_len =
|
222 |
-
mask, num_values
|
223 |
-
|
224 |
-
|
|
|
225 |
out = self.se_layer(x)
|
226 |
-
return out *
|
227 |
|
228 |
|
229 |
class TdnnSeModule(nn.Module):
|
|
|
214 |
nn.Sigmoid(),
|
215 |
)
|
216 |
|
217 |
+
def forward(self, inputs, length=None):
    """Apply masked squeeze-and-excitation gating.

    Args:
        inputs: tensor of shape (B, D, T).
        length: optional tensor of valid sequence lengths per batch
            element; when given, padded time steps are excluded from
            the pooled temporal mean.

    Returns:
        Tensor of shape (B, D, T): ``inputs`` scaled channel-wise by
        the SE gate computed from the (masked) temporal mean.
    """
    if length is None:
        # Plain mean over all T steps.  Fixed `keep_dim` -> `keepdim`:
        # torch.mean has no `keep_dim` kwarg, so this branch raised
        # TypeError as originally written (the else-branch already
        # used the correct spelling).
        x = torch.mean(inputs, dim=2, keepdim=True)
    else:
        max_len = inputs.size(2)
        # shape of `mask` is (B, 1, T) and shape of `num_values` is (B, 1, 1)
        mask, num_values = lens_to_mask(length, max_len=max_len, device=inputs.device)
        # Masked mean over valid time steps only; shape of `x` is (B, D, 1).
        x = torch.sum((inputs * mask), dim=2, keepdim=True) / (num_values)
    out = self.se_layer(x)
    return out * inputs
|
232 |
|
233 |
|
234 |
class TdnnSeModule(nn.Module):
|