Fixed an issue where Silero-VAD version 5 failed to recognize speech in some audio files.
Recently, developer snakers4 released version 5 of Silero-VAD, which causes speech recognition to fail on some audio files. Until a proper fix is available, we have reverted to version 4 to avoid this issue.
https://github.com/snakers4/silero-vad/issues/515
Also upgraded the project's Gradio version to 5.6.0.
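
For context, the revert works by pointing torch.hub at the v4.0 tag instead of master. A minimal sketch of the mechanism, as applied in src/vad.py below (`trust_repo=True` merely suppresses the interactive trust prompt added in newer PyTorch releases):

```python
# Pin silero-vad to the v4.0 tag so torch.hub never downloads the v5 model.
# The "owner/repo:ref" form of repo_or_dir selects a git tag or branch.
import torch

model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad:v4.0",  # tag pin, per issue 515 above
    model="silero_vad",
    trust_repo=True,  # skip the interactive trust prompt in newer torch
)
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils
```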
- README.md +1 -1
- app.py +11 -11
- requirements-fasterWhisper.txt +1 -1
- requirements-whisper.txt +1 -1
- requirements.txt +1 -1
- src/vad.py +67 -3
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: ✨
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.6.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py
CHANGED
@@ -1365,8 +1365,8 @@ def create_ui(app_config: ApplicationConfig):
     else:
         print("Queue mode disabled - progress bars will not be shown.")
 
-    demo.launch(inbrowser=app_config.autolaunch, share=app_config.share, server_name=app_config.server_name, server_port=find_free_port()
-                ssr_mode=False) # [Gradio 5.x] ERROR: Exception in ASGI application
+    demo.launch(inbrowser=app_config.autolaunch, share=app_config.share, server_name=app_config.server_name, server_port=find_free_port())
+    # ,ssr_mode=False) # [Gradio 5.x] ERROR: Exception in ASGI application
 
     # Clean up
     ui.close()
@@ -1438,15 +1438,15 @@ if __name__ == '__main__':
 
     updated_config = default_app_config.update(**args)
 
-
-
-
-
-
-
-
-
-
+    updated_config.whisper_implementation = "faster-whisper"
+    updated_config.input_audio_max_duration = -1
+    updated_config.default_model_name = "large-v2"
+    updated_config.output_dir = "output"
+    updated_config.vad_max_merge_size = 90
+    updated_config.merge_subtitle_with_sources = False
+    updated_config.autolaunch = True
+    updated_config.auto_parallel = False
+    updated_config.save_downloaded_files = True
 
     try:
         if torch.cuda.is_available():
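Note that `find_free_port()` is a project helper whose implementation is not part of this diff. Purely as a hypothetical illustration, such a helper is often written by binding to port 0 and reading back the port the OS assigns:

```python
# Hypothetical sketch only; the project's actual find_free_port() is not
# shown in this diff and may differ.
import socket

def find_free_port() -> int:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))            # port 0 asks the OS for any unused port
        return s.getsockname()[1]  # read back the port that was assigned
```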
requirements-fasterWhisper.txt
CHANGED
@@ -2,7 +2,7 @@
 ctranslate2>=4.4.0
 faster-whisper>=1.0.3
 ffmpeg-python==0.2.0
-gradio==5.
+gradio==5.6.0
 yt-dlp
 json5
 torch
requirements-whisper.txt
CHANGED
@@ -2,7 +2,7 @@
 ctranslate2>=4.4.0
 git+https://github.com/openai/whisper.git
 ffmpeg-python==0.2.0
-gradio==5.
+gradio==5.6.0
 yt-dlp
 json5
 torch
requirements.txt
CHANGED
@@ -2,7 +2,7 @@
 ctranslate2>=4.4.0
 faster-whisper>=1.0.3
 ffmpeg-python==0.2.0
-gradio==5.
+gradio==5.6.0
 yt-dlp
 json5
 torch
src/vad.py
CHANGED
@@ -508,13 +508,77 @@ class VadSileroTranscription(AbstractTranscription):
         """
         (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
         https://github.com/snakers4/silero-vad
+        def get_speech_timestamps(audio: torch.Tensor,
+                                  model,
+                                  threshold: float = 0.5,
+                                  sampling_rate: int = 16000,
+                                  min_speech_duration_ms: int = 250,
+                                  max_speech_duration_s: float = float('inf'),
+                                  min_silence_duration_ms: int = 100,
+                                  speech_pad_ms: int = 30,
+                                  return_seconds: bool = False,
+                                  visualize_probs: bool = False,
+                                  progress_tracking_callback: Callable[[float], None] = None,
+                                  neg_threshold: float = None,
+                                  window_size_samples: int = 512,):
+
+        This method is used for splitting long audios into speech chunks using silero VAD
+
+        Parameters
+        ----------
+        audio: torch.Tensor, one dimensional
+            One dimensional float torch.Tensor, other types are casted to torch if possible
+
+        model: preloaded .jit/.onnx silero VAD model
+
+        threshold: float (default - 0.5)
+            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
+            It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
+
+        sampling_rate: int (default - 16000)
+            Currently silero VAD models support 8000 and 16000 (or multiply of 16000) sample rates
+
+        min_speech_duration_ms: int (default - 250 milliseconds)
+            Final speech chunks shorter min_speech_duration_ms are thrown out
+
+        max_speech_duration_s: int (default - inf)
+            Maximum duration of speech chunks in seconds
+            Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent agressive cutting.
+            Otherwise, they will be split aggressively just before max_speech_duration_s.
+
+        min_silence_duration_ms: int (default - 100 milliseconds)
+            In the end of each speech chunk wait for min_silence_duration_ms before separating it
+
+        speech_pad_ms: int (default - 30 milliseconds)
+            Final speech chunks are padded by speech_pad_ms each side
+
+        return_seconds: bool (default - False)
+            whether return timestamps in seconds (default - samples)
+
+        visualize_probs: bool (default - False)
+            whether draw prob hist or not
+
+        progress_tracking_callback: Callable[[float], None] (default - None)
+            callback function taking progress in percents as an argument
+
+        neg_threshold: float (default = threshold - 0.15)
+            Negative threshold (noise or exit threshold). If model's current state is SPEECH, values BELOW this value are considered as NON-SPEECH.
+
+        window_size_samples: int (default - 512 samples)
+            !!! DEPRECATED, DOES NOTHING !!!
+
+        Returns
+        ----------
+        speeches: list of dicts
+            list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds)
+        https://github.com/snakers4/silero-vad/blob/master/src/silero_vad/utils_vad.py
         """
         repo_owner = "snakers4"
-        repo_name = "silero-vad"
+        repo_name = "silero-vad:v4.0" # https://github.com/snakers4/silero-vad/issues/515
         ref = "master"
 
         try:
-            model, utils = torch.hub.load(repo_or_dir=f'{repo_owner}/{repo_name}', model='silero_vad')
+            model, utils = torch.hub.load(repo_or_dir=f'{repo_owner}/{repo_name}', model='silero_vad', trust_repo=True)
         except Exception as e:
             hub_dir = torch.hub.get_dir()
             owner_name_branch = '_'.join([repo_owner, repo_name, ref])
@@ -547,7 +611,7 @@ class VadSileroTranscription(AbstractTranscription):
         print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
         wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
 
-        sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
+        sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD) #, neg_threshold=0.15, return_seconds=True
         seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
         adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)
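
To tie the vad.py changes together, here is a minimal end-to-end sketch of the pinned v4 VAD path. It assumes a 16 kHz mono input; `example.wav` is a hypothetical file, and `SPEECH_TRESHOLD` mirrors the project's constant name (0.5 is the silero default):

```python
# Load the pinned v4.0 model, detect speech chunks in a file, and convert
# sample-based timestamps to seconds as vad.py does via multiply_timestamps.
import torch

SAMPLING_RATE = 16_000
SPEECH_TRESHOLD = 0.5  # constant name as spelled in the project

model, utils = torch.hub.load("snakers4/silero-vad:v4.0",
                              model="silero_vad", trust_repo=True)
(get_speech_timestamps, save_audio, read_audio,
 VADIterator, collect_chunks) = utils

wav = read_audio("example.wav", sampling_rate=SAMPLING_RATE)  # hypothetical input
sample_timestamps = get_speech_timestamps(
    wav, model, sampling_rate=SAMPLING_RATE, threshold=SPEECH_TRESHOLD)

# Each entry is {'start': ..., 'end': ...} in samples; divide by the rate
# to get seconds, which is what multiply_timestamps does with factor=1/rate.
seconds_timestamps = [{k: v / SAMPLING_RATE for k, v in ts.items()}
                      for ts in sample_timestamps]
print(seconds_timestamps)
```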