Siddhant committed
Commit ad2eea0
1 Parent(s): ff79b53

Update app.py

Files changed (1)
  1. app.py +27 -10
app.py CHANGED
@@ -115,6 +115,7 @@ def int2float(sound):
     return sound
 
 text_str=""
+vad_output=None
 audio_output = None
 min_speech_ms=500
 max_speech_ms=float("inf")
@@ -146,14 +147,15 @@ LM_pipe(
 )
 end_event.record()
 torch.cuda.synchronize()
-vad_model, _ = torch.hub.load("snakers4/silero-vad:v4.0", "silero_vad")
-vad_iterator = VADIterator(
-    vad_model,
-    threshold=0.3,
-    sampling_rate=16000,
-    min_silence_duration_ms=250,
-    speech_pad_ms=500,
-)
+# vad_model, _ = torch.hub.load("snakers4/silero-vad:v4.0", "silero_vad")
+# vad_iterator = VADIterator(
+#     vad_model,
+#     threshold=0.3,
+#     sampling_rate=16000,
+#     min_silence_duration_ms=250,
+#     speech_pad_ms=500,
+# )
+import webrtcvad
 
 import time
 def transcribe(stream, new_chunk):
@@ -162,6 +164,7 @@ def transcribe(stream, new_chunk):
     global chat
     global user_role
     global audio_output
+    global vad_output
 
     audio_int16 = np.frombuffer(y, dtype=np.int16)
     audio_float32 = int2float(audio_int16)
@@ -175,9 +178,23 @@ def transcribe(stream, new_chunk):
     print(log_mel_spectrogram)
     print(sr)
     print(audio_float32.shape)
-    vad_output = vad_iterator(torch.from_numpy(audio_float32))
+    # vad_output = vad_iterator(torch.from_numpy(audio_float32))
+    vad_count=0
+    for i in range(int(len(y)/960)):
+        vad = webrtcvad.Vad()
+        vad.set_mode(3)
+        if (vad.is_speech(y[i*960:(i+1)*960].tobytes(), orig_sr)):
+            vad_count+=1
+    if vad_count>10:
+        vad_curr=True
+        if vad_output is None:
+            vad_output=[torch.from_numpy(audio_float32)]
+        else:
+            vad_output.append(torch.from_numpy(audio_float32))
+    else:
+        vad_curr=False
 
-    if vad_output is not None and len(vad_output) != 0:
+    if vad_output is not None and vad_curr==False:
         print("VAD: end of speech detected")
         array = torch.cat(vad_output).cpu().numpy()
         duration_ms = len(array) / sr * 1000
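
For context, the replacement runs WebRTC VAD over fixed-size 16-bit PCM frames and treats the incoming chunk as speech once enough frames are voiced. The sketch below is a minimal, self-contained version of that pattern, not the app's exact code: it assumes 16 kHz mono audio, 30 ms frames (480 samples), and a voiced-frame threshold of 10, whereas the commit slices 960-sample frames at the stream's native rate (orig_sr). Note that webrtcvad only accepts 10/20/30 ms frames at 8/16/32/48 kHz.

# Minimal sketch of frame-based speech detection with webrtcvad.
# Assumptions (not taken from the commit): 16 kHz, 16-bit mono PCM,
# 30 ms frames, voiced-frame threshold of 10.
import numpy as np
import webrtcvad

def chunk_has_speech(chunk: np.ndarray, sample_rate: int = 16000,
                     frame_ms: int = 30, min_voiced_frames: int = 10) -> bool:
    """Return True if enough frames of an int16 chunk are classified as speech."""
    vad = webrtcvad.Vad(3)                      # mode 3 = most aggressive, as in the commit
    frame_len = sample_rate * frame_ms // 1000  # samples per frame (480 at 16 kHz / 30 ms)
    voiced = 0
    for i in range(len(chunk) // frame_len):
        frame = chunk[i * frame_len:(i + 1) * frame_len]
        if vad.is_speech(frame.tobytes(), sample_rate):
            voiced += 1
    return voiced > min_voiced_frames

# Example: a 0.5 s chunk of silence should not register as speech.
silence = np.zeros(8000, dtype=np.int16)
print(chunk_has_speech(silence))  # expected: False

One design note on the committed version: constructing webrtcvad.Vad() once per chunk (or once globally), rather than inside the per-frame loop, avoids repeated allocation while giving the same classification results.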