Spaces:
Running
Running
laubonghaudoi
committed on
Commit
·
972e738
1
Parent(s):
d10d964
Add max_single_segment_time
Browse files
transcriber/AutoTranscriber.py
CHANGED
@@ -25,6 +25,7 @@ class AutoTranscriber:
|
|
25 |
use_denoiser=False,
|
26 |
with_punct=True,
|
27 |
offset_in_seconds=-0.25,
|
|
|
28 |
sr=16000,
|
29 |
):
|
30 |
self.corrector = corrector
|
@@ -32,9 +33,10 @@ class AutoTranscriber:
|
|
32 |
self.with_punct = with_punct
|
33 |
self.sr = sr
|
34 |
self.offset_in_seconds = offset_in_seconds
|
|
|
35 |
|
36 |
# Initialize models
|
37 |
-
self.vad_model = AutoModel(model="fsmn-vad", device=device,)
|
38 |
self.asr_model = AutoModel(
|
39 |
model="iic/SenseVoiceSmall",
|
40 |
vad_model=None, # We'll handle VAD separately
|
@@ -64,14 +66,13 @@ class AutoTranscriber:
|
|
64 |
# speech, _ = denoiser(speech, sr)
|
65 |
|
66 |
if sr != 16_000:
|
67 |
-
speech = resample(speech, sr, 16_000,
|
68 |
-
filter="kaiser_best", parallel=True)
|
69 |
|
70 |
# Get VAD segments
|
71 |
logger.info("Segmenting speech...")
|
72 |
|
73 |
start_time = time.time()
|
74 |
-
vad_results = self.vad_model.generate(input=speech)
|
75 |
logger.info("VAD took %.2f seconds", time.time() - start_time)
|
76 |
|
77 |
if not vad_results or not vad_results[0]["value"]:
|
|
|
25 |
use_denoiser=False,
|
26 |
with_punct=True,
|
27 |
offset_in_seconds=-0.25,
|
28 |
+
max_length_seconds=5,
|
29 |
sr=16000,
|
30 |
):
|
31 |
self.corrector = corrector
|
|
|
33 |
self.with_punct = with_punct
|
34 |
self.sr = sr
|
35 |
self.offset_in_seconds = offset_in_seconds
|
36 |
+
self.max_length_seconds = max_length_seconds
|
37 |
|
38 |
# Initialize models
|
39 |
+
self.vad_model = AutoModel(model="fsmn-vad", device=device, max_single_segment_time=self.max_length_seconds * 1000)
|
40 |
self.asr_model = AutoModel(
|
41 |
model="iic/SenseVoiceSmall",
|
42 |
vad_model=None, # We'll handle VAD separately
|
|
|
66 |
# speech, _ = denoiser(speech, sr)
|
67 |
|
68 |
if sr != 16_000:
|
69 |
+
speech = resample(speech, sr, 16_000, filter="kaiser_best", parallel=True)
|
|
|
70 |
|
71 |
# Get VAD segments
|
72 |
logger.info("Segmenting speech...")
|
73 |
|
74 |
start_time = time.time()
|
75 |
+
vad_results = self.vad_model.generate(input=speech, disable_pbar=True)
|
76 |
logger.info("VAD took %.2f seconds", time.time() - start_time)
|
77 |
|
78 |
if not vad_results or not vad_results[0]["value"]:
|