Update app.py
app.py CHANGED
@@ -36,12 +36,11 @@ def preprocess(audio_path):
     print("PREPROCESSING ENDED")
     return temp_path
 
-def transcribe(diarise, how_diarise, audio):
+def fast_transcribe(diarise, how_diarise, translate, audio):
     audio = preprocess(audio)
     y, sr = sf.read(audio)
-    print(diarise)
     if diarise:
-        if how_diarise=="
+        if how_diarise=="Accurate":
             print("DIARISING")
             dia = pipeline_dia(audio)
             print("DIARISING ENDED")
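The calls above rely on module-level objects defined earlier in app.py and not shown in this diff. A minimal sketch of how `pipe`, `pipeline_dia`, and `translator` are plausibly constructed, assuming the Transformers ASR pipeline, pyannote.audio, and googletrans (the exact versions and any auth-token handling in the Space may differ):

```python
# Assumed setup for the globals used in this diff; not part of the commit.
import soundfile as sf                # sf.read(audio) above
from transformers import pipeline     # pipe(y[start:end], ...) below
from pyannote.audio import Pipeline   # pipeline_dia(audio) above
from googletrans import Translator    # translator.translate(...) below

# Wav2Vec2 model named in the demo description added at the bottom of this diff.
pipe = pipeline("automatic-speech-recognition",
                model="bond005/wav2vec2-large-ru-golos-with-lm")

# Pyannote speaker diarization pipeline (the "Accurate" branch);
# recent pyannote releases additionally require a Hugging Face auth token.
pipeline_dia = Pipeline.from_pretrained("pyannote/speaker-diarization")

# googletrans client for the Russian-to-English subtitle translation.
translator = Translator()
```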
@@ -60,7 +59,13 @@ def transcribe(diarise, how_diarise, audio):
             label = res[2]
             print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt} SPEAKER_{label}")
             trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
-            lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
+            if not translate:
+                lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n")
+            else:
+                print("TRANSLATION STARTED")
+                trans_eng = translator.translate(trans, src='ru', dest="en").text
+                print(f"TRANSLATION ENDED RESULT {trans_eng}")
+                lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n[{label}] {trans}\n[{label}] {trans_eng}\n")
             print("RECOGNISING ENDED")
             print(f"LINE RESULT {trans}")
         else:
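The `res[0]`, `res[1]`, `res[2]` pattern (unpacked as `start`, `end`, `label` above) implies the pyannote annotation `dia` has been flattened into per-segment tuples before this loop. A hypothetical helper showing one way that flattening could work; `itertracks` and the sample-rate scaling are assumptions, not code from this commit:

```python
# Hypothetical: flatten a pyannote annotation into (start, end, label) tuples,
# converting turn times in seconds to sample indices into the array y.
def diarization_segments(dia, sr):
    segments = []
    for turn, _, speaker in dia.itertracks(yield_label=True):
        segments.append((int(turn.start * sr), int(turn.end * sr), speaker))
    return segments
```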
@@ -82,8 +87,15 @@ def transcribe(diarise, how_diarise, audio):
             end_time_srt = f"{end_time_prts[0]}:{end_time_prts[1]}:{float(end_time_prts[2]):.3f}".replace('.', ',')
             print(f"RECOGNISING LINE_{i} T_START {start_time_srt} T_END {end_time_srt}")
             trans = pipe(y[start:end], chunk_length_s=10, stride_length_s=(4, 2))["text"]
-            lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
             print("RECOGNISING ENDED")
+            if not translate:
+                lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n")
+            else:
+                print("TRANSLATION STARTED")
+                trans_eng = translator.translate(trans, src='ru', dest="en").text
+                print(f"TRANSLATION ENDED RESULT {trans_eng}")
+                lines.append(f"{i+1}\n{start_time_srt} --> {end_time_srt}\n{trans}\n{trans_eng}\n")
+
             print(f"LINE RESULT {trans}")
         text = "\n".join(lines)
     else:
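The `:.3f` format plus `.replace('.', ',')` in the context lines above builds SRT-style `HH:MM:SS,mmm` timestamps. For illustration only, the same format as a self-contained helper:

```python
# Illustration of the SRT timestamp format assembled inline above.
def srt_timestamp(seconds: float) -> str:
    hours = int(seconds // 3600)
    minutes = int(seconds % 3600 // 60)
    secs = seconds % 60
    return f"{hours:02d}:{minutes:02d}:{secs:06.3f}".replace('.', ',')

print(srt_timestamp(75.5))  # -> 00:01:15,500
```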
@@ -93,12 +105,27 @@ def transcribe(diarise, how_diarise, audio):
         text = res["text"]
     return text
 
-
-
-
-
-
-
-)
+with gr.Blocks() as demo:
+    gr.Markdown("""
+    # Wav2Vec2 RuOH
+    Realtime demo for Russian Oral History recognition using several diarization methods (Silero VAD, Pyannote) and the Wav2Vec2 large model from bond005: https://huggingface.co/bond005/wav2vec2-large-ru-golos-with-lm
+    """)
+    with gr.Tab("Fast Translation"):
 
-
+        with gr.Row():
+            with gr.Column():
+                fast_diarize_input = gr.Checkbox(label="Subtitles", info="Do you want subtitles?")
+                fast_diarize_radio_input = gr.Radio(["Fast", "Accurate", "-"], label="separating_on_subtitles_option", info="You can choose to split the audio into smaller pieces with a faster but lower-quality variant (Silero VAD) or a slower but higher-quality variant (Pyannote.Diarization; this option will detect different speakers)")
+                fast_translate_input = gr.Checkbox(label="Translate", info="Do you want a translation to English?")
+                fast_audio_input = gr.Audio(type="filepath")
+
+            fast_output = gr.Textbox()
+
+        fast_inputs = [fast_diarize_input, fast_diarize_radio_input, fast_translate_input, fast_audio_input]
+        fast_recognize_button = gr.Button("Run")
+
+
+        fast_recognize_button.click(fast_transcribe, inputs=fast_inputs, outputs=fast_output)
+
+if __name__ == "__main__":
+    demo.launch()
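With this wiring, `fast_recognize_button.click` passes the four components in `fast_inputs` positionally to `fast_transcribe(diarise, how_diarise, translate, audio)` and renders the returned subtitle text in `fast_output`; `demo.launch()` then serves the Blocks app.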