avans06 commited on
Commit
e146f3a
1 Parent(s): f360106

Fixed the issue where Silero-VAD version 5 could not recognize speech.

Browse files

Recently, the developer snakers4 released version 5 of Silero-VAD, which caused speech recognition to fail on some audio files. Until a proper solution is found, we have reverted to using version 4 to avoid this issue.
https://github.com/snakers4/silero-vad/issues/515

Upgraded the project's Gradio version to 5.6.0.

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: ✨
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.4.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 5.6.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
app.py CHANGED
@@ -1365,8 +1365,8 @@ def create_ui(app_config: ApplicationConfig):
1365
  else:
1366
  print("Queue mode disabled - progress bars will not be shown.")
1367
 
1368
- demo.launch(inbrowser=app_config.autolaunch, share=app_config.share, server_name=app_config.server_name, server_port=find_free_port(),
1369
- ssr_mode=False) # [Gradio 5.x] ERROR: Exception in ASGI application
1370
 
1371
  # Clean up
1372
  ui.close()
@@ -1438,15 +1438,15 @@ if __name__ == '__main__':
1438
 
1439
  updated_config = default_app_config.update(**args)
1440
 
1441
- # updated_config.whisper_implementation = "faster-whisper"
1442
- # updated_config.input_audio_max_duration = -1
1443
- # updated_config.default_model_name = "large-v2"
1444
- # updated_config.output_dir = "output"
1445
- # updated_config.vad_max_merge_size = 90
1446
- # updated_config.merge_subtitle_with_sources = False
1447
- # updated_config.autolaunch = True
1448
- # updated_config.auto_parallel = False
1449
- # updated_config.save_downloaded_files = True
1450
 
1451
  try:
1452
  if torch.cuda.is_available():
 
1365
  else:
1366
  print("Queue mode disabled - progress bars will not be shown.")
1367
 
1368
+ demo.launch(inbrowser=app_config.autolaunch, share=app_config.share, server_name=app_config.server_name, server_port=find_free_port())
1369
+ # ,ssr_mode=False) # [Gradio 5.x] ERROR: Exception in ASGI application
1370
 
1371
  # Clean up
1372
  ui.close()
 
1438
 
1439
  updated_config = default_app_config.update(**args)
1440
 
1441
+ updated_config.whisper_implementation = "faster-whisper"
1442
+ updated_config.input_audio_max_duration = -1
1443
+ updated_config.default_model_name = "large-v2"
1444
+ updated_config.output_dir = "output"
1445
+ updated_config.vad_max_merge_size = 90
1446
+ updated_config.merge_subtitle_with_sources = False
1447
+ updated_config.autolaunch = True
1448
+ updated_config.auto_parallel = False
1449
+ updated_config.save_downloaded_files = True
1450
 
1451
  try:
1452
  if torch.cuda.is_available():
requirements-fasterWhisper.txt CHANGED
@@ -2,7 +2,7 @@
2
  ctranslate2>=4.4.0
3
  faster-whisper>=1.0.3
4
  ffmpeg-python==0.2.0
5
- gradio==5.4.0
6
  yt-dlp
7
  json5
8
  torch
 
2
  ctranslate2>=4.4.0
3
  faster-whisper>=1.0.3
4
  ffmpeg-python==0.2.0
5
+ gradio==5.6.0
6
  yt-dlp
7
  json5
8
  torch
requirements-whisper.txt CHANGED
@@ -2,7 +2,7 @@
2
  ctranslate2>=4.4.0
3
  git+https://github.com/openai/whisper.git
4
  ffmpeg-python==0.2.0
5
- gradio==5.4.0
6
  yt-dlp
7
  json5
8
  torch
 
2
  ctranslate2>=4.4.0
3
  git+https://github.com/openai/whisper.git
4
  ffmpeg-python==0.2.0
5
+ gradio==5.6.0
6
  yt-dlp
7
  json5
8
  torch
requirements.txt CHANGED
@@ -2,7 +2,7 @@
2
  ctranslate2>=4.4.0
3
  faster-whisper>=1.0.3
4
  ffmpeg-python==0.2.0
5
- gradio==5.4.0
6
  yt-dlp
7
  json5
8
  torch
 
2
  ctranslate2>=4.4.0
3
  faster-whisper>=1.0.3
4
  ffmpeg-python==0.2.0
5
+ gradio==5.6.0
6
  yt-dlp
7
  json5
8
  torch
src/vad.py CHANGED
@@ -508,13 +508,77 @@ class VadSileroTranscription(AbstractTranscription):
508
  """
509
  (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
510
  https://github.com/snakers4/silero-vad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  """
512
  repo_owner = "snakers4"
513
- repo_name = "silero-vad"
514
  ref = "master"
515
 
516
  try:
517
- model, utils = torch.hub.load(repo_or_dir=f'{repo_owner}/{repo_name}', model='silero_vad')
518
  except Exception as e:
519
  hub_dir = torch.hub.get_dir()
520
  owner_name_branch = '_'.join([repo_owner, repo_name, ref])
@@ -547,7 +611,7 @@ class VadSileroTranscription(AbstractTranscription):
547
  print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
548
  wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
549
 
550
- sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
551
  seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
552
  adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)
553
 
 
508
  """
509
  (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
510
  https://github.com/snakers4/silero-vad
511
+ def get_speech_timestamps(audio: torch.Tensor,
512
+ model,
513
+ threshold: float = 0.5,
514
+ sampling_rate: int = 16000,
515
+ min_speech_duration_ms: int = 250,
516
+ max_speech_duration_s: float = float('inf'),
517
+ min_silence_duration_ms: int = 100,
518
+ speech_pad_ms: int = 30,
519
+ return_seconds: bool = False,
520
+ visualize_probs: bool = False,
521
+ progress_tracking_callback: Callable[[float], None] = None,
522
+ neg_threshold: float = None,
523
+ window_size_samples: int = 512,):
524
+
525
+ This method is used for splitting long audios into speech chunks using silero VAD
526
+
527
+ Parameters
528
+ ----------
529
+ audio: torch.Tensor, one dimensional
530
+ One dimensional float torch.Tensor, other types are casted to torch if possible
531
+
532
+ model: preloaded .jit/.onnx silero VAD model
533
+
534
+ threshold: float (default - 0.5)
535
+ Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
536
+ It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
537
+
538
+ sampling_rate: int (default - 16000)
539
+ Currently silero VAD models support 8000 and 16000 (or multiply of 16000) sample rates
540
+
541
+ min_speech_duration_ms: int (default - 250 milliseconds)
542
+ Final speech chunks shorter min_speech_duration_ms are thrown out
543
+
544
+ max_speech_duration_s: int (default - inf)
545
+ Maximum duration of speech chunks in seconds
546
+ Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent aggressive cutting.
547
+ Otherwise, they will be split aggressively just before max_speech_duration_s.
548
+
549
+ min_silence_duration_ms: int (default - 100 milliseconds)
550
+ In the end of each speech chunk wait for min_silence_duration_ms before separating it
551
+
552
+ speech_pad_ms: int (default - 30 milliseconds)
553
+ Final speech chunks are padded by speech_pad_ms each side
554
+
555
+ return_seconds: bool (default - False)
556
+ whether return timestamps in seconds (default - samples)
557
+
558
+ visualize_probs: bool (default - False)
559
+ whether draw prob hist or not
560
+
561
+ progress_tracking_callback: Callable[[float], None] (default - None)
562
+ callback function taking progress in percents as an argument
563
+
564
+ neg_threshold: float (default = threshold - 0.15)
565
+ Negative threshold (noise or exit threshold). If model's current state is SPEECH, values BELOW this value are considered as NON-SPEECH.
566
+
567
+ window_size_samples: int (default - 512 samples)
568
+ !!! DEPRECATED, DOES NOTHING !!!
569
+
570
+ Returns
571
+ ----------
572
+ speeches: list of dicts
573
+ list containing ends and beginnings of speech chunks (samples or seconds based on return_seconds)
574
+ https://github.com/snakers4/silero-vad/blob/master/src/silero_vad/utils_vad.py
575
  """
576
  repo_owner = "snakers4"
577
+ repo_name = "silero-vad:v4.0" # https://github.com/snakers4/silero-vad/issues/515
578
  ref = "master"
579
 
580
  try:
581
+ model, utils = torch.hub.load(repo_or_dir=f'{repo_owner}/{repo_name}', model='silero_vad', trust_repo=True)
582
  except Exception as e:
583
  hub_dir = torch.hub.get_dir()
584
  owner_name_branch = '_'.join([repo_owner, repo_name, ref])
 
611
  print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
612
  wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
613
 
614
+ sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD) #, neg_threshold=0.15, return_seconds=True
615
  seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
616
  adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_start + chunk_duration)
617