Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -115,6 +115,7 @@ def int2float(sound):
|
|
115 |
return sound
|
116 |
|
117 |
text_str=""
|
|
|
118 |
audio_output = None
|
119 |
min_speech_ms=500
|
120 |
max_speech_ms=float("inf")
|
@@ -146,14 +147,15 @@ LM_pipe(
|
|
146 |
)
|
147 |
end_event.record()
|
148 |
torch.cuda.synchronize()
|
149 |
-
vad_model, _ = torch.hub.load("snakers4/silero-vad:v4.0", "silero_vad")
|
150 |
-
vad_iterator = VADIterator(
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
)
|
|
|
157 |
|
158 |
import time
|
159 |
def transcribe(stream, new_chunk):
|
@@ -162,6 +164,7 @@ def transcribe(stream, new_chunk):
|
|
162 |
global chat
|
163 |
global user_role
|
164 |
global audio_output
|
|
|
165 |
|
166 |
audio_int16 = np.frombuffer(y, dtype=np.int16)
|
167 |
audio_float32 = int2float(audio_int16)
|
@@ -175,9 +178,23 @@ def transcribe(stream, new_chunk):
|
|
175 |
print(log_mel_spectrogram)
|
176 |
print(sr)
|
177 |
print(audio_float32.shape)
|
178 |
-
vad_output = vad_iterator(torch.from_numpy(audio_float32))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
|
180 |
-
if vad_output is not None and
|
181 |
print("VAD: end of speech detected")
|
182 |
array = torch.cat(vad_output).cpu().numpy()
|
183 |
duration_ms = len(array) / sr * 1000
|
|
|
115 |
return sound
|
116 |
|
117 |
text_str=""
|
118 |
+
vad_output=None
|
119 |
audio_output = None
|
120 |
min_speech_ms=500
|
121 |
max_speech_ms=float("inf")
|
|
|
147 |
)
|
148 |
end_event.record()
|
149 |
torch.cuda.synchronize()
|
150 |
+
# vad_model, _ = torch.hub.load("snakers4/silero-vad:v4.0", "silero_vad")
|
151 |
+
# vad_iterator = VADIterator(
|
152 |
+
# vad_model,
|
153 |
+
# threshold=0.3,
|
154 |
+
# sampling_rate=16000,
|
155 |
+
# min_silence_duration_ms=250,
|
156 |
+
# speech_pad_ms=500,
|
157 |
+
# )
|
158 |
+
import webrtcvad
|
159 |
|
160 |
import time
|
161 |
def transcribe(stream, new_chunk):
|
|
|
164 |
global chat
|
165 |
global user_role
|
166 |
global audio_output
|
167 |
+
global vad_output
|
168 |
|
169 |
audio_int16 = np.frombuffer(y, dtype=np.int16)
|
170 |
audio_float32 = int2float(audio_int16)
|
|
|
178 |
print(log_mel_spectrogram)
|
179 |
print(sr)
|
180 |
print(audio_float32.shape)
|
181 |
+
# vad_output = vad_iterator(torch.from_numpy(audio_float32))
|
182 |
+
vad_count=0
|
183 |
+
for i in range(int(len(y)/960)):
|
184 |
+
vad = webrtcvad.Vad()
|
185 |
+
vad.set_mode(3)
|
186 |
+
if (vad.is_speech(y[i*960:(i+1)*960].tobytes(), orig_sr)):
|
187 |
+
vad_count+=1
|
188 |
+
if vad_count>10:
|
189 |
+
vad_curr=True
|
190 |
+
if vad_output is None:
|
191 |
+
vad_output=[torch.from_numpy(audio_float32)]
|
192 |
+
else:
|
193 |
+
vad_output.append(torch.from_numpy(audio_float32))
|
194 |
+
else:
|
195 |
+
vad_curr=False
|
196 |
|
197 |
+
if vad_output is not None and vad_curr==False:
|
198 |
print("VAD: end of speech detected")
|
199 |
array = torch.cat(vad_output).cpu().numpy()
|
200 |
duration_ms = len(array) / sr * 1000
|