Liangcd committed on
Commit
e3feded
1 Parent(s): d6e0b7c

[demo] refine app.py

Files changed (2)
  1. app.py +6 -53
  2. requirements.txt +2 -3
app.py CHANGED
@@ -14,10 +14,7 @@
 # limitations under the License.
 
 import gradio as gr
-import torchaudio
-import torchaudio.compliance.kaldi as kaldi
-import torch
-import onnxruntime as ort
+import wespeakerruntime as wespeaker
 from sklearn.metrics.pairwise import cosine_similarity
 
 STYLE = """
@@ -49,53 +46,9 @@ OUTPUT_ERROR = (STYLE + """
 </div>
 """)
 
+vox_model = wespeaker.Inference('pre_model/voxceleb_resnet34.onnx')
+cnc_model = wespeaker.Inference('pre_model/cnceleb_resnet34.onnx')
 
-
-def compute_fbank(wav_path,
-                  num_mel_bins=80,
-                  frame_length=25,
-                  frame_shift=10,
-                  dither=0.0,
-                  resample_rate=16000):
-    """Extract fbank, similar to the one in wespeaker.dataset.processor,
-    while integrating the wave reading and CMN.
-    """
-    waveform, sample_rate = torchaudio.load(wav_path)
-    # Resample to the rate the model expects.
-    if sample_rate != resample_rate:
-        waveform = torchaudio.transforms.Resample(
-            orig_freq=sample_rate, new_freq=resample_rate)(waveform)
-    waveform = waveform * (1 << 15)
-    mat = kaldi.fbank(waveform,
-                      num_mel_bins=num_mel_bins,
-                      frame_length=frame_length,
-                      frame_shift=frame_shift,
-                      dither=dither,
-                      sample_frequency=resample_rate,
-                      window_type='hamming',
-                      use_energy=False)
-    # CMN, without CVN
-    mat = mat - torch.mean(mat, dim=0)
-    return mat
-
-
-class OnnxModel(object):
-
-    def __init__(self, model_path):
-        so = ort.SessionOptions()
-        so.inter_op_num_threads = 1
-        so.intra_op_num_threads = 1
-        self.session = ort.InferenceSession(model_path, sess_options=so)
-
-    def extract_embedding(self, wav_path):
-        feats = compute_fbank(wav_path)
-        feats = feats.unsqueeze(0).numpy()
-
-        embeddings = self.session.run(output_names=['embs'],
-                                      input_feed={'feats': feats})
-        return embeddings[0]
-
-vox_model = OnnxModel('pre_model/voxceleb_resnet34.onnx')
-cnc_model = OnnxModel('pre_model/cnceleb_resnet34.onnx')
 
 def speaker_verification(audio_path1, audio_path2, lang='CN'):
     if audio_path1 is None or audio_path2 is None:
@@ -108,13 +61,13 @@ def speaker_verification(audio_path1, audio_path2, lang='CN'):
     else:
         output = OUTPUT_ERROR.format('Please select a language')
         return output
-    emb1 = model.extract_embedding(audio_path1)
-    emb2 = model.extract_embedding(audio_path2)
+    emb1 = model.extract_embedding_wav(audio_path1)
+    emb2 = model.extract_embedding_wav(audio_path2)
     cos_score = cosine_similarity(emb1.reshape(1, -1),
                                   emb2.reshape(1, -1))[0][0]
     cos_score = (cos_score + 1) / 2.0
 
-    if cos_score >= 0.73:
+    if cos_score >= 0.70:
         output = OUTPUT_OK.format(cos_score * 100)
     else:
         output = OUTPUT_FAIL.format(cos_score * 100)
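
For orientation, the refactor collapses the removed fbank + onnxruntime plumbing into wespeakerruntime's Inference wrapper. A minimal sketch of the resulting flow, assuming the checkpoints live under pre_model/ as in the diff; the wav paths are hypothetical stand-ins for the Gradio inputs, and plain NumPy replaces scikit-learn's cosine_similarity so the snippet is self-contained:

import numpy as np
import wespeakerruntime as wespeaker

# Load an ONNX checkpoint once at startup, as the refactored app.py does.
model = wespeaker.Inference('pre_model/voxceleb_resnet34.onnx')

# 'utt1.wav' / 'utt2.wav' are hypothetical input paths.
emb1 = np.asarray(model.extract_embedding_wav('utt1.wav')).reshape(-1)
emb2 = np.asarray(model.extract_embedding_wav('utt2.wav')).reshape(-1)

# Cosine similarity, rescaled from [-1, 1] to [0, 1] as in the demo.
cos = float(emb1 @ emb2 / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
score = (cos + 1) / 2.0

# The commit lowers the accept threshold from 0.73 to 0.70.
print('same speaker' if score >= 0.70 else 'different speaker', round(score, 3))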
requirements.txt CHANGED
@@ -1,4 +1,3 @@
-onnxruntime==1.11.1
 gradio
-torchaudio
-scikit-learn
+wespeakerruntime
+scikit-learn
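
Since wespeakerruntime now covers the feature extraction and ONNX inference that torchaudio and onnxruntime previously provided, the trimmed dependency set installs with the usual `pip install -r requirements.txt`.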