Update app.py
Browse files
app.py
CHANGED
@@ -41,27 +41,7 @@ def transcribe(diarise, how_diarise, audio):
|
|
41 |
y, sr = sf.read(audio)
|
42 |
print(diarise)
|
43 |
if diarise:
|
44 |
-
if how_diarise=="FastButLowQuality":
|
45 |
-
print("DIARISING")
|
46 |
-
wav = read_audio(audio) # backend (sox, soundfile, or ffmpeg) required!
|
47 |
-
speech_timestamps = get_speech_timestamps(wav, model, speech_pad_ms=80, min_silence_duration_ms=150, window_size_samples=256)
|
48 |
-
print("DIARISING ENDED")
|
49 |
-
lines = []
|
50 |
-
for i, line in enumerate(speech_timestamps):
|
51 |
-
start = line['start']
|
52 |
-
start_time = str(datetime.fromtimestamp(start / sr)).split()[1]
|
53 |
-
start_time_prts = start_time.split(":")
|
54 |
-
start_time_srt = f"{start_time_prts[0]}:{start_time_prts[1]}:{float(start_time_prts[2]):.3f}".replace('.', ',')
|
55 |
-
end = line['end']
|
56 |
-
end_time = str(datetime.fromtimestamp(end / sr)).split()[1]
|
57 |
-
end_time_prts = end_time.split(":")
|
58 |
-
end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
|
59 |
-
print(f"RECOGNISING LINE_{i} T_START{start_time} T_END{end_time}")
|
60 |
-
trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
|
61 |
-
lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
|
62 |
-
print("RECOGNISING ENDED")
|
63 |
-
print(f"LINE RESULT {trans}")
|
64 |
-
elif how_diarise=="SlowButHighQuality":
|
65 |
print("DIARISING")
|
66 |
dia = pipeline_dia(audio)
|
67 |
print("DIARISING ENDED")
|
@@ -78,11 +58,33 @@ def transcribe(diarise, how_diarise, audio):
|
|
78 |
end_time_prts = end_time.split(":")
|
79 |
end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
|
80 |
label = res[2]
|
81 |
-
print(f"RECOGNISING LINE_{i} T_START{start_time} T_END{end_time}")
|
82 |
trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
|
83 |
lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
|
84 |
print("RECOGNISING ENDED")
|
85 |
print(f"LINE RESULT {trans}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
text = "\n".join(lines)
|
87 |
else:
|
88 |
print("RECOGNISING FULL AUDIO")
|
@@ -93,7 +95,7 @@ def transcribe(diarise, how_diarise, audio):
|
|
93 |
|
94 |
iface = gr.Interface(
|
95 |
fn=transcribe,
|
96 |
-
inputs=[gr.Checkbox(label="Diarise", info="Do you want subtitles?"), gr.Radio(["FastButLowQuality", "SlowButHighQuality", "-"], label="Diarise_Variant", info="You can choose separating on smaller pieces by faster yet low quality variant (Silero VAD), or slower yet high quality variant (Pyannote.Diarization, this option will detect different speakers)"), gr.Audio(type="filepath")],
|
97 |
outputs="text",
|
98 |
title="Wav2Vec2 RuOH",
|
99 |
description=r"Realtime demo for Russian Oral History recognition using several diarizations method (Silero VAD, Pyannote) and a Wav2Vec large model from bond005. https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm",
|
|
|
41 |
y, sr = sf.read(audio)
|
42 |
print(diarise)
|
43 |
if diarise:
|
44 |
+
if how_diarise=="SlowButHighQuality":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
print("DIARISING")
|
46 |
dia = pipeline_dia(audio)
|
47 |
print("DIARISING ENDED")
|
|
|
58 |
end_time_prts = end_time.split(":")
|
59 |
end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
|
60 |
label = res[2]
|
61 |
+
print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt} SPEAKER_{label}")
|
62 |
trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
|
63 |
lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
|
64 |
print("RECOGNISING ENDED")
|
65 |
print(f"LINE RESULT {trans}")
|
66 |
+
else:
|
67 |
+
print("DIARISING")
|
68 |
+
wav = read_audio(audio) # backend (sox, soundfile, or ffmpeg) required!
|
69 |
+
speech_timestamps = get_speech_timestamps(wav, model, speech_pad_ms=80, min_silence_duration_ms=150, window_size_samples=256)
|
70 |
+
print("DIARISING ENDED")
|
71 |
+
lines = []
|
72 |
+
for i, line in enumerate(speech_timestamps):
|
73 |
+
start = line['start']
|
74 |
+
print(start)
|
75 |
+
start_time = str(datetime.fromtimestamp(start / sr)).split()[1]
|
76 |
+
start_time_prts = start_time.split(":")
|
77 |
+
start_time_srt = f"{start_time_prts[0]}:{start_time_prts[1]}:{float(start_time_prts[2]):.3f}".replace('.', ',')
|
78 |
+
print(start_time_srt)
|
79 |
+
end = line['end']
|
80 |
+
end_time = str(datetime.fromtimestamp(end / sr)).split()[1]
|
81 |
+
end_time_prts = end_time.split(":")
|
82 |
+
end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
|
83 |
+
print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt}")
|
84 |
+
trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
|
85 |
+
lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
|
86 |
+
print("RECOGNISING ENDED")
|
87 |
+
print(f"LINE RESULT {trans}")
|
88 |
text = "\n".join(lines)
|
89 |
else:
|
90 |
print("RECOGNISING FULL AUDIO")
|
|
|
95 |
|
96 |
iface = gr.Interface(
|
97 |
fn=transcribe,
|
98 |
+
inputs=[gr.Checkbox(label="Diarise", info="Do you want subtitles?"), gr.Radio(["FastButLowQuality", "SlowButHighQuality", "-"], label="Diarise_Variant", info="You can choose separating on smaller pieces by faster yet low quality variant (Silero VAD), or slower yet high quality variant (Pyannote.Diarization, this option will detect different speakers)"), gr.Audio(type="filepath")],
|
99 |
outputs="text",
|
100 |
title="Wav2Vec2 RuOH",
|
101 |
description=r"Realtime demo for Russian Oral History recognition using several diarizations method (Silero VAD, Pyannote) and a Wav2Vec large model from bond005. https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm",
|