laubonghaudoi committed on
Commit
972e738
·
1 Parent(s): d10d964

Add max_single_segment_time

Browse files
Files changed (1) hide show
  1. transcriber/AutoTranscriber.py +5 -4
transcriber/AutoTranscriber.py CHANGED
@@ -25,6 +25,7 @@ class AutoTranscriber:
25
  use_denoiser=False,
26
  with_punct=True,
27
  offset_in_seconds=-0.25,
 
28
  sr=16000,
29
  ):
30
  self.corrector = corrector
@@ -32,9 +33,10 @@ class AutoTranscriber:
32
  self.with_punct = with_punct
33
  self.sr = sr
34
  self.offset_in_seconds = offset_in_seconds
 
35
 
36
  # Initialize models
37
- self.vad_model = AutoModel(model="fsmn-vad", device=device,)
38
  self.asr_model = AutoModel(
39
  model="iic/SenseVoiceSmall",
40
  vad_model=None, # We'll handle VAD separately
@@ -64,14 +66,13 @@ class AutoTranscriber:
64
  # speech, _ = denoiser(speech, sr)
65
 
66
  if sr != 16_000:
67
- speech = resample(speech, sr, 16_000,
68
- filter="kaiser_best", parallel=True)
69
 
70
  # Get VAD segments
71
  logger.info("Segmenting speech...")
72
 
73
  start_time = time.time()
74
- vad_results = self.vad_model.generate(input=speech)
75
  logger.info("VAD took %.2f seconds", time.time() - start_time)
76
 
77
  if not vad_results or not vad_results[0]["value"]:
 
25
  use_denoiser=False,
26
  with_punct=True,
27
  offset_in_seconds=-0.25,
28
+ max_length_seconds=5,
29
  sr=16000,
30
  ):
31
  self.corrector = corrector
 
33
  self.with_punct = with_punct
34
  self.sr = sr
35
  self.offset_in_seconds = offset_in_seconds
36
+ self.max_length_seconds = max_length_seconds
37
 
38
  # Initialize models
39
+ self.vad_model = AutoModel(model="fsmn-vad", device=device, max_single_segment_time=self.max_length_seconds * 1000)
40
  self.asr_model = AutoModel(
41
  model="iic/SenseVoiceSmall",
42
  vad_model=None, # We'll handle VAD separately
 
66
  # speech, _ = denoiser(speech, sr)
67
 
68
  if sr != 16_000:
69
+ speech = resample(speech, sr, 16_000, filter="kaiser_best", parallel=True)
 
70
 
71
  # Get VAD segments
72
  logger.info("Segmenting speech...")
73
 
74
  start_time = time.time()
75
+ vad_results = self.vad_model.generate(input=speech, disable_pbar=True)
76
  logger.info("VAD took %.2f seconds", time.time() - start_time)
77
 
78
  if not vad_results or not vad_results[0]["value"]: