add models
- README.md +82 -1
- model_cpu.pt +3 -0
- model_cuda.pt +3 -0
- tracing_code.py +92 -0
README.md
CHANGED
@@ -1,3 +1,84 @@
---
language: "en"
thumbnail:
tags:
- embeddings
- Speaker
- Verification
- Identification
- pytorch
- xvectors
- TDNN
- speechbrain
- audio-classification
license: "apache-2.0"
datasets:
- voxceleb
inference: false
---

# Xvector embeddings extraction on Voxceleb

This repository provides all the necessary tools to extract speaker embeddings with a TDNN model pretrained with SpeechBrain.
The system is trained on the Voxceleb 1 + Voxceleb 2 training data.

This repository provides a traced version of the model shared at: https://huggingface.co/speechbrain/spkrec-xvect-voxceleb

SpeechBrain is not required to use this model.

To use the model:

```python
import torch
import torchaudio

device = "cpu"  # or "cuda" to load the model on a GPU
model = torch.jit.load(f"<this_repo>/model_{device}.pt")
wavsignal, fs = torchaudio.load('audio.wav')
embeddings = model(wavsignal).squeeze()
```
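
The model was trained on Voxceleb, whose recordings are 16 kHz mono, so audio with a different sampling rate or multiple channels should likely be converted first. A minimal sketch, assuming a 16 kHz mono target (the local model path is hypothetical):

```python
import torch
import torchaudio

wavsignal, fs = torchaudio.load("audio.wav")
if wavsignal.shape[0] > 1:
    # Downmix multi-channel audio to mono.
    wavsignal = wavsignal.mean(dim=0, keepdim=True)
if fs != 16000:
    # Resample to the 16 kHz rate of the Voxceleb training data.
    wavsignal = torchaudio.functional.resample(wavsignal, orig_freq=fs, new_freq=16000)

model = torch.jit.load("model_cpu.pt")  # hypothetical local path
embeddings = model(wavsignal).squeeze()
```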

## Warning

This model can only forward a single waveform at a time.
Also, the model was traced for speaker embedding extraction,
so any additional task (speaker classification, verification) is disabled.
To embed several recordings, call the model once per waveform, as sketched below.

You can retrace the model using the code shared in `tracing_code.py`.
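
A minimal sketch of embedding multiple recordings with a plain loop (the file names and model path are hypothetical):

```python
import torch
import torchaudio

model = torch.jit.load("model_cpu.pt")  # hypothetical local path

embeddings = {}
for path in ["speaker1.wav", "speaker2.wav"]:  # hypothetical file list
    wav, fs = torchaudio.load(path)
    embeddings[path] = model(wav).squeeze()
```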

## All the credit goes to the SpeechBrain team

### Limitations
The SpeechBrain team does not provide any warranty on the performance achieved by this model when used on other datasets.

#### Referencing xvectors
```bibtex
@inproceedings{DBLP:conf/odyssey/SnyderGMSPK18,
  author    = {David Snyder and
               Daniel Garcia{-}Romero and
               Alan McCree and
               Gregory Sell and
               Daniel Povey and
               Sanjeev Khudanpur},
  title     = {Spoken Language Recognition using X-vectors},
  booktitle = {Odyssey 2018},
  pages     = {105--111},
  year      = {2018},
}
```

# **Citing SpeechBrain**
Please cite SpeechBrain if you use it for your research or business.

```bibtex
@misc{speechbrain,
  title={{SpeechBrain}: A General-Purpose Speech Toolkit},
  author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
  year={2021},
  eprint={2106.04624},
  archivePrefix={arXiv},
  primaryClass={eess.AS},
  note={arXiv:2106.04624}
}
```
model_cpu.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e1acfffabc6ba019df1f9905fc8637e27052075142688d56700bf937edb7e838
size 16961987
model_cuda.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:07568c618fa9f7b164f29b7e6cb2aa56a89240b5cdbaf7857f8e80fadebaa76a
size 16962241
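
Both checkpoints are stored through Git LFS, so a plain `git clone` needs `git lfs pull` to fetch the binaries. Alternatively, a minimal sketch with `huggingface_hub` (the repo id is a placeholder, as above):

```python
import torch
from huggingface_hub import hf_hub_download

# "<this_repo>" is a placeholder for this repository's id on the Hub.
model_path = hf_hub_download(repo_id="<this_repo>", filename="model_cpu.pt")
model = torch.jit.load(model_path)
```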
tracing_code.py
ADDED
@@ -0,0 +1,92 @@
import os

import torch
import torchaudio
from torch import nn

from speechbrain.lobes.models.Xvector import Xvector
from speechbrain.lobes.features import Fbank
from speechbrain.processing.features import InputNormalization


class Extractor(nn.Module):
    """Wraps the SpeechBrain x-vector pipeline (feature extraction,
    normalization, embedding model) so it can be traced with torch.jit.trace."""

    model_dict = [
        "mean_var_norm",
        "compute_features",
        "embedding_model",
        "mean_var_norm_emb",
    ]

    def __init__(self, model_path, n_mels=24, device="cpu"):
        super().__init__()
        self.device = device
        self.compute_features = Fbank(n_mels=n_mels)
        self.mean_var_norm = InputNormalization(norm_type="sentence", std_norm=False)
        self.embedding_model = Xvector(
            in_channels=n_mels,
            activation=torch.nn.LeakyReLU,
            tdnn_blocks=5,
            tdnn_channels=[512, 512, 512, 512, 1500],
            tdnn_kernel_sizes=[5, 3, 3, 1, 1],
            tdnn_dilations=[1, 2, 3, 1, 1],
            lin_neurons=512,
        )
        self.mean_var_norm_emb = InputNormalization(norm_type="global", std_norm=False)

        # Load each submodule's checkpoint from the pretrained model directory.
        for mod_name in self.model_dict:
            filename = os.path.join(model_path, f"{mod_name}.ckpt")
            module = getattr(self, mod_name)
            if os.path.exists(filename):
                if hasattr(module, "_load"):
                    print(f"Load: {filename}")
                    module._load(filename)
                else:
                    print(f"Load State Dict: {filename}")
                    module.load_state_dict(torch.load(filename))
                module.to(self.device)

    @torch.no_grad()
    def forward(self, wavs, wav_lens=None, normalize=False):
        # Manage single waveforms in input
        if len(wavs.shape) == 1:
            wavs = wavs.unsqueeze(0)

        # Assign full length if wav_lens is not assigned
        if wav_lens is None:
            wav_lens = torch.ones(wavs.shape[0], device=self.device)

        # Storing waveform in the specified device
        wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
        wavs = wavs.float()

        # Computing features and embeddings
        feats = self.compute_features(wavs)
        feats = self.mean_var_norm(feats, wav_lens)
        embeddings = self.embedding_model(feats, wav_lens)
        if normalize:
            embeddings = self.mean_var_norm_emb(
                embeddings, torch.ones(embeddings.shape[0], device=self.device)
            )
        return embeddings


MODEL_PATH = "pretrained_models/spkrec-xvect-voxceleb"
signal, fs = torchaudio.load('audio.wav')

device = "cuda"
extractor = Extractor(MODEL_PATH, device=device)

# Freeze all parameters before tracing.
for p in extractor.parameters():
    p.requires_grad = False

extractor.eval()
embeddings_x = extractor(signal).cpu().squeeze()

# Tracing
traced_model = torch.jit.trace(extractor, signal)
torch.jit.save(traced_model, f"model_{device}.pt")
embeddings_t = traced_model(signal).squeeze()
print(embeddings_t)

model = torch.jit.load(f"model_{device}.pt")
emb_m = model(signal).squeeze()
print(emb_m)
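
# Hypothetical addition (not part of the original script): verify that the
# traced and reloaded models reproduce the eager embeddings numerically,
# rather than comparing the printed tensors by eye.
assert torch.allclose(embeddings_x, embeddings_t.cpu(), atol=1e-5)
assert torch.allclose(embeddings_t, emb_m)
print("max abs diff (eager vs traced):",
      (embeddings_x - embeddings_t.cpu()).abs().max().item())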