Update app.py
Browse files
app.py
CHANGED
@@ -41,27 +41,7 @@ def transcribe(diarise, how_diarise, audio):
|
|
41 |
y, sr = sf.read(audio)
|
42 |
print(diarise)
|
43 |
if diarise:
|
44 |
-
if how_diarise=="FastButLowQuality":
|
45 |
-
print("DIARISING")
|
46 |
-
wav = read_audio(audio) # backend (sox, soundfile, or ffmpeg) required!
|
47 |
-
speech_timestamps = get_speech_timestamps(wav, model, speech_pad_ms=80, min_silence_duration_ms=150, window_size_samples=256)
|
48 |
-
print("DIARISING ENDED")
|
49 |
-
lines = []
|
50 |
-
for i, line in enumerate(speech_timestamps):
|
51 |
-
start = line['start']
|
52 |
-
start_time = str(datetime.fromtimestamp(start / sr)).split()[1]
|
53 |
-
start_time_prts = start_time.split(":")
|
54 |
-
start_time_srt = f"{start_time_prts[0]}:{start_time_prts[1]}:{float(start_time_prts[2]):.3f}".replace('.', ',')
|
55 |
-
end = line['end']
|
56 |
-
end_time = str(datetime.fromtimestamp(end / sr)).split()[1]
|
57 |
-
end_time_prts = end_time.split(":")
|
58 |
-
end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
|
59 |
-
print(f"RECOGNISING LINE_{i} T_START{start_time} T_END{end_time}")
|
60 |
-
trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
|
61 |
-
lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
|
62 |
-
print("RECOGNISING ENDED")
|
63 |
-
print(f"LINE RESULT {trans}")
|
64 |
-
elif how_diarise=="SlowButHighQuality":
|
65 |
print("DIARISING")
|
66 |
dia = pipeline_dia(audio)
|
67 |
print("DIARISING ENDED")
|
@@ -78,11 +58,33 @@ def transcribe(diarise, how_diarise, audio):
|
|
78 |
end_time_prts = end_time.split(":")
|
79 |
end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
|
80 |
label = res[2]
|
81 |
-
print(f"RECOGNISING LINE_{i} T_START{start_time} T_END{end_time}")
|
82 |
trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
|
83 |
lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
|
84 |
print("RECOGNISING ENDED")
|
85 |
print(f"LINE RESULT {trans}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
text = "\n".join(lines)
|
87 |
else:
|
88 |
print("RECOGNISING FULL AUDIO")
|
@@ -93,7 +95,7 @@ def transcribe(diarise, how_diarise, audio):
|
|
93 |
|
94 |
iface = gr.Interface(
|
95 |
fn=transcribe,
|
96 |
-
inputs=[gr.Checkbox(label="Diarise", info="Do you want subtitles?"), gr.Radio(["FastButLowQuality", "SlowButHighQuality", "-"], label="Diarise_Variant", info="You can choose separating on smaller pieces by faster yet low quality variant (Silero VAD), or slower yet high quality variant (Pyannote.Diarization, this option will detect different speakers)"), gr.Audio(type="filepath")],
|
97 |
outputs="text",
|
98 |
title="Wav2Vec2 RuOH",
|
99 |
description=r"Realtime demo for Russian Oral History recognition using several diarizations method (Silero VAD, Pyannote) and a Wav2Vec large model from bond005. https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm",
|
|
|
41 |
y, sr = sf.read(audio)
|
42 |
print(diarise)
|
43 |
if diarise:
|
44 |
+
if how_diarise=="SlowButHighQuality":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
print("DIARISING")
|
46 |
dia = pipeline_dia(audio)
|
47 |
print("DIARISING ENDED")
|
|
|
58 |
end_time_prts = end_time.split(":")
|
59 |
end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
|
60 |
label = res[2]
|
61 |
+
print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt} SPEAKER_{label}")
|
62 |
trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
|
63 |
lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
|
64 |
print("RECOGNISING ENDED")
|
65 |
print(f"LINE RESULT {trans}")
|
66 |
+
else:
|
67 |
+
print("DIARISING")
|
68 |
+
wav = read_audio(audio) # backend (sox, soundfile, or ffmpeg) required!
|
69 |
+
speech_timestamps = get_speech_timestamps(wav, model, speech_pad_ms=80, min_silence_duration_ms=150, window_size_samples=256)
|
70 |
+
print("DIARISING ENDED")
|
71 |
+
lines = []
|
72 |
+
for i, line in enumerate(speech_timestamps):
|
73 |
+
start = line['start']
|
74 |
+
print(start)
|
75 |
+
start_time = str(datetime.fromtimestamp(start / sr)).split()[1]
|
76 |
+
start_time_prts = start_time.split(":")
|
77 |
+
start_time_srt = f"{start_time_prts[0]}:{start_time_prts[1]}:{float(start_time_prts[2]):.3f}".replace('.', ',')
|
78 |
+
print(start_time_srt)
|
79 |
+
end = line['end']
|
80 |
+
end_time = str(datetime.fromtimestamp(end / sr)).split()[1]
|
81 |
+
end_time_prts = end_time.split(":")
|
82 |
+
end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
|
83 |
+
print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt}")
|
84 |
+
trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
|
85 |
+
lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
|
86 |
+
print("RECOGNISING ENDED")
|
87 |
+
print(f"LINE RESULT {trans}")
|
88 |
text = "\n".join(lines)
|
89 |
else:
|
90 |
print("RECOGNISING FULL AUDIO")
|
|
|
95 |
|
96 |
iface = gr.Interface(
|
97 |
fn=transcribe,
|
98 |
+
inputs=[gr.Checkbox(label="Diarise", info="Do you want subtitles?"), gr.Radio(["FastButLowQuality", "SlowButHighQuality", "-"], label="Diarise_Variant", info="You can choose separating on smaller pieces by faster yet low quality variant (Silero VAD), or slower yet high quality variant (Pyannote.Diarization, this option will detect different speakers)"), gr.Audio(type="filepath")],
|
99 |
outputs="text",
|
100 |
title="Wav2Vec2 RuOH",
|
101 |
description=r"Realtime demo for Russian Oral History recognition using several diarizations method (Silero VAD, Pyannote) and a Wav2Vec large model from bond005. https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm",
|