Commit 98d0bf0 by katospiegel
Parent: e4028fa
Commit message: results
README.md CHANGED

```diff
@@ -41,12 +41,12 @@ The user will logging using a password and user specified by me. That user and p
 - [ ] Add mel spectrogram?
 - [ ] Add Whisper parameters to the interface
 - [x] Add Whisper X
-- [
-- [
-- [
+- [x] Introduce SRT as output
+- [x] Obtain txt with Diarization.
+- [x] Obtain plain txt with segments.
 - [ ] Introduce POS.
-- [
-- [ ]
+- [x] Optional Preprocessing
+- [ ] Transcripcion box as the text being written.
 
 
 Introduce Tab for analysis including POS. Maybe it would be great to have a visualizer with the timestamps and other features in Streamlit. Quizas correcciones.
```
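The newly checked "Introduce SRT as output" item boils down to mapping each segment's start/end timestamps onto SRT cues. A minimal sketch of that conversion, assuming Whisper-style segments with `start`/`end` in seconds and a `text` field; `segments_to_srt` and `_srt_time` are hypothetical helpers, not the Space's actual code:

```python
# Hypothetical sketch of segments -> SRT; not the helper used in this Space.
def _srt_time(seconds: float) -> str:
    # SRT timestamps use the form HH:MM:SS,mmm
    ms = int(round(seconds * 1000))
    h, ms = divmod(ms, 3_600_000)
    m, ms = divmod(ms, 60_000)
    s, ms = divmod(ms, 1_000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def segments_to_srt(segments) -> str:
    # One cue per segment: index, "start --> end", text, blank separator.
    cues = []
    for i, seg in enumerate(segments, start=1):
        cues.append(f"{i}\n{_srt_time(seg['start'])} --> {_srt_time(seg['end'])}\n{seg['text'].strip()}\n")
    return "\n".join(cues)
```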
app.py CHANGED
```diff
@@ -38,20 +38,30 @@ def transcribe(audiofile, model, preprocesamiento):
     #Archivo
     nombre_archivo = guardar_en_archivo(out)
 
+    results_short = result
+    results_short["segments"] = results_short["segments"][0:10]
 
-    return audio_path, audio_normalized_path, vocal_path, novocal_path, nombre_archivo, "\n".join(out)
+    return audio_path, audio_normalized_path, vocal_path, novocal_path, nombre_archivo, "\n".join(out), json.dumps(results_short)
 
-def transcribeWhisperX(audiofile, model, language, patience,
-                       initial_prompt, condition_on_previous_text, temperature,
-                       compression, logprob, no_speech_threshold):
+def transcribeWhisperX(audiofile, model, language, preprocesamiento
+                       #patience, initial_prompt, condition_on_previous_text, temperature,
+                       #compression, logprob, no_speech_threshold
+                       ):
+
+    #if audiofile.type is not str:
     audio_path = audiofile.name
 
-    audio_normalized_path = normalizeAudio(audio_path, ".wav")
+    if preprocesamiento == "Pre-procesamiento del audio":
+        audio_normalized_path = normalizeAudio(audio_path, ".wav")
 
-    novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
+        novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
 
-    novocal_path = mp3_to_wav(novocal_path, "novocal")
-    vocal_path = mp3_to_wav(vocal_path, "vocal")
+        novocal_path = mp3_to_wav(novocal_path, "novocal")
+        vocal_path = mp3_to_wav(vocal_path, "vocal")
+    else:
+        audio_normalized_path = audio_path
+        novocal_path = audio_path
+        vocal_path = audio_path
 
     #result = fast_transcription(vocal_path, model, "es")
     result_whisper, result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model, language=language)
```
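A note on the `results_short` lines that appear in both functions of this commit: `results_short = result` only binds a second name to the same dict, so the slice assignment truncates `result` (and, in the next hunk, `result_speakers`) as well. If the full result is still needed afterwards, copy before trimming; a minimal non-mutating sketch, where `trim_for_preview` is a hypothetical name, not part of app.py:

```python
import copy
import json

def trim_for_preview(result: dict, max_segments: int = 10) -> str:
    # Deep-copy so the caller's dict keeps all segments; the committed
    # version (results_short = result) mutates the original via aliasing.
    short = copy.deepcopy(result)
    short["segments"] = short["segments"][:max_segments]
    return json.dumps(short)
```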
```diff
@@ -92,8 +102,11 @@ def transcribeWhisperX(audiofile, model, language, patience,
 
     ############################################################################
 
+    results_short = result_speakers
+    results_short["segments"] = results_short["segments"][0:10]
+
     outputs = (audio_path, audio_normalized_path, vocal_path, novocal_path, lineas_txt_string, htmlout,
-               nombre_file_txt, str(file_path), guardar_dataframe_en_csv(diarize_segments), json.dumps(
+               nombre_file_txt, str(file_path), guardar_dataframe_en_csv(diarize_segments), json.dumps(results_short))
 
     return outputs
```
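`gr.Interface` maps a returned tuple onto the `outputs` list positionally, so the two trailing elements added here (the diarization CSV path and the JSON preview string) each need a matching component at the same position. A hedged sketch of how the tail of that list could look; the actual components are declared outside this hunk, so these choices are assumptions:

```python
import gradio as gr

# Hypothetical tail of the outputs list; element i of the returned
# tuple feeds component i, so order must match the tuple above.
tail_outputs = [
    gr.File(label="Diarization CSV"),       # guardar_dataframe_en_csv(diarize_segments)
    gr.Textbox(label="JSON (10 segments)"), # json.dumps(results_short)
]
```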
```diff
@@ -128,13 +141,14 @@ transcribeII = gr.Interface(
         gr.File(label="Upload Files"),
         gr.Radio(["base", "small", "medium", "large-v2"], label="Modelo", value="large-v2"),
         gr.Dropdown(["Cualquiera","es","en","fr","pt"], label="Lenguaje", value="Cualquiera"),
-        gr.Slider(minimum=0, maximum=1, label="Patience (Whisper parameter)", value=0.5, info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search"),
-        gr.Textbox(label="Initial Prompt (Whisper parameter)", value=""),
-        gr.Textbox(label="Condition on previous text (Whisper parameter)", value=""),
-        gr.Slider(minimum=0, maximum=1, label="Temperature (Whisper parameter)", value=0.5, info="Temperature to use for sampling"),
-        gr.Slider(minimum=0, maximum=1, label="Compression Ratio Threshold (Whisper parameter)", value=0.5),
-        gr.Slider(minimum=0, maximum=1, label="Logprob Threshold (Whisper parameter)", value=0.5),
-        gr.Slider(minimum=0, maximum=1, label="No Speech Threshold (Whisper parameter)", value=0.5),
+        gr.Radio(["Audio Original","Pre-procesamiento del audio"], label="Mejora del audio", value="Pre-procesamiento del audio"),
+        #gr.Slider(minimum=0, maximum=1, label="Patience (Whisper parameter)", value=0.5, info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search"),
+        #gr.Textbox(label="Initial Prompt (Whisper parameter)", value=""),
+        #gr.Textbox(label="Condition on previous text (Whisper parameter)", value=""),
+        #gr.Slider(minimum=0, maximum=1, label="Temperature (Whisper parameter)", value=0.5, info="Temperature to use for sampling"),
+        #gr.Slider(minimum=0, maximum=1, label="Compression Ratio Threshold (Whisper parameter)", value=0.5),
+        #gr.Slider(minimum=0, maximum=1, label="Logprob Threshold (Whisper parameter)", value=0.5),
+        #gr.Slider(minimum=0, maximum=1, label="No Speech Threshold (Whisper parameter)", value=0.5),
     ],
     outputs=[gr.Audio(type="filepath", label="original"),
              gr.Audio(type="filepath", label="normalized"),
```
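`gr.Radio` hands the selected choice to the function as a plain string, so the new "Mejora del audio" input is exactly what `transcribeWhisperX` compares against `"Pre-procesamiento del audio"` in the first hunk. A runnable sketch of that wiring with a stub standing in for the real function (the actual app declares many more inputs and outputs):

```python
import gradio as gr

def transcribe_stub(audiofile, model, language, preprocesamiento):
    # The Radio choice arrives verbatim; the comparison must match the
    # choice string exactly.
    if preprocesamiento == "Pre-procesamiento del audio":
        return f"would preprocess {audiofile.name}, then run {model} ({language})"
    return f"would run {model} ({language}) on the audio as-is"

demo = gr.Interface(
    fn=transcribe_stub,
    inputs=[
        gr.File(label="Upload Files"),
        gr.Radio(["base", "small", "medium", "large-v2"], label="Modelo", value="large-v2"),
        gr.Dropdown(["Cualquiera", "es", "en", "fr", "pt"], label="Lenguaje", value="Cualquiera"),
        gr.Radio(["Audio Original", "Pre-procesamiento del audio"],
                 label="Mejora del audio", value="Pre-procesamiento del audio"),
    ],
    outputs=gr.Textbox(label="status"),
)
```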
```diff
@@ -158,13 +172,14 @@ transcribeII = gr.Interface(
     examples=[["Espana 04 - Video 01 - extracto 2 min.wav",
                "large-v2",
                "Cualquiera",
-               0.5,
-               "",
-               "",
-               0.5,
-               0.5,
-               0.5,
-               0.5]]
+               #0.5,
+               #"",
+               #"",
+               #0.5,
+               #0.5,
+               #0.5,
+               #0.5
+               ]]
 
     )
 
```
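Worth double-checking after this hunk: each `examples` row must supply one value per declared input. With the seven Whisper parameter values commented out, the row carries three values, while the interface above now declares four inputs (file, Modelo, Lenguaje, Mejora del audio), which appears to leave the example one value short. A sketch of a row matching the four remaining inputs; the fourth value is an assumption:

```python
examples = [[
    "Espana 04 - Video 01 - extracto 2 min.wav",  # gr.File
    "large-v2",                                   # Modelo
    "Cualquiera",                                 # Lenguaje
    "Pre-procesamiento del audio",                # Mejora del audio (assumed)
]]
```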