katospiegel committed
Commit 98d0bf0 · 1 Parent(s): e4028fa
Files changed (2):
  1. README.md +5 -5
  2. app.py +38 -23
README.md CHANGED
@@ -41,12 +41,12 @@ The user will logging using a password and user specified by me. That user and p
  - [ ] Add mel spectrogram?
  - [ ] Add Whisper parameters to the interface
  - [x] Add Whisper X
- - [ ] Introduce SRT as output
- - [ ] Obtain txt with Diarization.
- - [ ] Obtain plain txt with segments.
+ - [x] Introduce SRT as output
+ - [x] Obtain txt with Diarization.
+ - [x] Obtain plain txt with segments.
  - [ ] Introduce POS.
- - [ ] Optional Preprocessing
- - [ ] Trasncripcion box as the text being written.
+ - [x] Optional Preprocessing
+ - [ ] Transcripcion box as the text being written.


  Introduce Tab for analysis including POS. Maybe it would be great to have a visualizer with the timestamps and other features in Streamlit. Quizas correcciones.
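For context on the newly checked "Introduce SRT as output" item: WhisperX's aligned output is a list of segment dicts with `start`, `end`, and `text` keys, and SRT is just numbered cues with `HH:MM:SS,mmm` time ranges, so the conversion is only a few lines. A minimal sketch under that schema assumption; the helper name is illustrative and is not defined in app.py:

```python
def segments_to_srt(segments):
    """Serialize [{'start': float, 'end': float, 'text': str}, ...] to SRT."""
    def ts(seconds):
        # SRT timestamps use comma-separated milliseconds: HH:MM:SS,mmm
        ms = int(round(seconds * 1000))
        h, ms = divmod(ms, 3_600_000)
        m, ms = divmod(ms, 60_000)
        s, ms = divmod(ms, 1_000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    cues = []
    for i, seg in enumerate(segments, start=1):
        cues.append(f"{i}\n{ts(seg['start'])} --> {ts(seg['end'])}\n{seg['text'].strip()}\n")
    return "\n".join(cues)
```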
app.py CHANGED
@@ -38,20 +38,30 @@ def transcribe(audiofile, model, preprocesamiento):
  #Archivo
  nombre_archivo = guardar_en_archivo(out)

- return audio_path, audio_normalized_path, vocal_path, novocal_path, nombre_archivo, "\n".join(out), json.dumps(result)
+ results_short = result
+ results_short["segments"] = results_short["segments"][0:10]
+
+ return audio_path, audio_normalized_path, vocal_path, novocal_path, nombre_archivo, "\n".join(out), json.dumps(results_short)

- def transcribeWhisperX(audiofile, model, language, patience,
-                        initial_prompt, condition_on_previous_text, temperature,
-                        compression, logprob, no_speech_threshold):
+ def transcribeWhisperX(audiofile, model, language, preprocesamiento
+                        #patience, initial_prompt, condition_on_previous_text, temperature,
+                        #compression, logprob, no_speech_threshold
+                        ):
+
+ #if audiofile.type is not str:
  audio_path = audiofile.name

- audio_normalized_path = normalizeAudio(audio_path, ".wav")
-
- novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
-
- novocal_path = mp3_to_wav(novocal_path, "novocal")
- vocal_path = mp3_to_wav(vocal_path, "vocal")
+ if preprocesamiento == "Pre-procesamiento del audio":
+     audio_normalized_path = normalizeAudio(audio_path, ".wav")
+
+     novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
+
+     novocal_path = mp3_to_wav(novocal_path, "novocal")
+     vocal_path = mp3_to_wav(vocal_path, "vocal")
+ else:
+     audio_normalized_path = audio_path
+     novocal_path = audio_path
+     vocal_path = audio_path

  #result = fast_transcription(vocal_path, model, "es")
  result_whisper, result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model, language=language)
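A note on the `results_short = result` lines in this hunk: plain assignment only binds a second name to the same dict, so the `[0:10]` slice also truncates `result["segments"]` itself. That is harmless here because `result` is not reused before returning, but if the full result were ever needed downstream, copying first would be safer. A minimal sketch of that defensive variant (not what the commit does; `result` stands in for the dict returned by transcription, and `json` is the same module app.py already calls):

```python
import json

# Shallow-copy the top-level dict so truncating the preview does not
# mutate the full transcription result. The segment dicts themselves
# stay shared, which is fine for read-only JSON serialization.
results_short = dict(result)
results_short["segments"] = results_short["segments"][:10]
preview = json.dumps(results_short)
```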
@@ -92,8 +102,11 @@ def transcribeWhisperX(audiofile, model, language, patience,

  ############################################################################

+ results_short = result_speakers
+ results_short["segments"] = results_short["segments"][0:10]
+
  outputs = (audio_path, audio_normalized_path, vocal_path, novocal_path, lineas_txt_string, htmlout,
-            nombre_file_txt, str(file_path), guardar_dataframe_en_csv(diarize_segments), json.dumps(result_speakers))
+            nombre_file_txt, str(file_path), guardar_dataframe_en_csv(diarize_segments), json.dumps(results_short))

  return outputs
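For readers of the `outputs` tuple above: `guardar_dataframe_en_csv(diarize_segments)` is defined elsewhere in app.py, so only its call site is visible in this diff. Judging from how its return value fills a file output slot, it presumably writes the diarization segments (a pandas DataFrame in the WhisperX pipeline) to disk and returns the path. A hedged sketch of such a helper; the signature and default filename are assumptions, not code from this commit:

```python
import pandas as pd

def guardar_dataframe_en_csv(df: pd.DataFrame, path: str = "diarize_segments.csv") -> str:
    # Persist the diarization segments (speaker, start, end, ...) and
    # return the file path so a Gradio File output can serve it.
    df.to_csv(path, index=False)
    return path
```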
@@ -128,13 +141,14 @@ transcribeII = gr.Interface(
  gr.File(label="Upload Files"),
  gr.Radio(["base", "small", "medium", "large-v2"], label="Modelo", value="large-v2"),
  gr.Dropdown(["Cualquiera","es","en","fr","pt"], label="Lenguaje", value="Cualquiera"),
- gr.Slider(minimum=0, maximum=1, label="Patience (Whisper parameter)", value=0.5, info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search"),
- gr.Textbox(label="Initial Prompt (Whisper parameter)", value=""),
- gr.Textbox(label="Condition on previous text (Whisper parameter)", value=""),
- gr.Slider(minimum=0, maximum=1, label="Temperature (Whisper parameter)", value=0.5, info="Temperature to use for sampling"),
- gr.Slider(minimum=0, maximum=1, label="Compression Ratio Threshold (Whisper parameter)", value=0.5),
- gr.Slider(minimum=0, maximum=1, label="Logprob Threshold (Whisper parameter)", value=0.5),
- gr.Slider(minimum=0, maximum=1, label="No Speech Threshold (Whisper parameter)", value=0.5),
+ gr.Radio(["Audio Original","Pre-procesamiento del audio"], label="Mejora del audio", value="Pre-procesamiento del audio"),
+ #gr.Slider(minimum=0, maximum=1, label="Patience (Whisper parameter)", value=0.5, info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search"),
+ #gr.Textbox(label="Initial Prompt (Whisper parameter)", value=""),
+ #gr.Textbox(label="Condition on previous text (Whisper parameter)", value=""),
+ #gr.Slider(minimum=0, maximum=1, label="Temperature (Whisper parameter)", value=0.5, info="Temperature to use for sampling"),
+ #gr.Slider(minimum=0, maximum=1, label="Compression Ratio Threshold (Whisper parameter)", value=0.5),
+ #gr.Slider(minimum=0, maximum=1, label="Logprob Threshold (Whisper parameter)", value=0.5),
+ #gr.Slider(minimum=0, maximum=1, label="No Speech Threshold (Whisper parameter)", value=0.5),
  ],
  outputs=[gr.Audio(type="filepath", label="original"),
  gr.Audio(type="filepath", label="normalized"),
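With the Whisper-parameter widgets commented out, the interface declares four inputs, and `gr.Interface` passes input values to the wrapped function positionally, so the list order must match `transcribeWhisperX(audiofile, model, language, preprocesamiento)` one-to-one. A stripped-down sketch of that wiring; the outputs are reduced to a single textbox here for brevity, whereas the real app returns a longer tuple of components:

```python
import gradio as gr

def transcribeWhisperX(audiofile, model, language, preprocesamiento):
    # Parameter order mirrors the inputs list below, one-to-one.
    return f"{model} | {language} | {preprocesamiento}"

demo = gr.Interface(
    fn=transcribeWhisperX,
    inputs=[
        gr.File(label="Upload Files"),                      # -> audiofile
        gr.Radio(["base", "small", "medium", "large-v2"],
                 label="Modelo", value="large-v2"),         # -> model
        gr.Dropdown(["Cualquiera", "es", "en", "fr", "pt"],
                    label="Lenguaje", value="Cualquiera"),  # -> language
        gr.Radio(["Audio Original", "Pre-procesamiento del audio"],
                 label="Mejora del audio",
                 value="Pre-procesamiento del audio"),      # -> preprocesamiento
    ],
    outputs=gr.Textbox(label="output"),
)
```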
@@ -158,13 +172,14 @@ transcribeII = gr.Interface(
  examples=[["Espana 04 - Video 01 - extracto 2 min.wav",
  "large-v2",
  "Cualquiera",
- 0.5,
- "",
- "",
- 0.5,
- 0.5,
- 0.5,
- 0.5]]
+ #0.5,
+ #"",
+ #"",
+ #0.5,
+ #0.5,
+ #0.5,
+ #0.5
+ ]]

  )
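One thing worth flagging in this last hunk: the interface now has four input components, but the surviving example row supplies only three values (file, model, language). Gradio expects one example value per input, so the new "Mejora del audio" Radio likely needs an entry as well. A hedged sketch of the corrected row:

```python
examples=[["Espana 04 - Video 01 - extracto 2 min.wav",
           "large-v2",
           "Cualquiera",
           "Pre-procesamiento del audio"]]  # value for the new Radio input
```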
 
 