katospiegel committed
Commit 98d0bf0 · 1 Parent(s): e4028fa
Files changed (2):
  1. README.md +5 -5
  2. app.py +38 -23
README.md CHANGED
@@ -41,12 +41,12 @@ The user will logging using a password and user specified by me. That user and p
  - [ ] Add mel spectrogram?
  - [ ] Add Whisper parameters to the interface
  - [x] Add Whisper X
- - [ ] Introduce SRT as output
- - [ ] Obtain txt with Diarization.
- - [ ] Obtain plain txt with segments.
+ - [x] Introduce SRT as output
+ - [x] Obtain txt with Diarization.
+ - [x] Obtain plain txt with segments.
  - [ ] Introduce POS.
- - [ ] Optional Preprocessing
- - [ ] Trasncripcion box as the text being written.
+ - [x] Optional Preprocessing
+ - [ ] Transcripcion box as the text being written.


  Introduce Tab for analysis including POS. Maybe it would be great to have a visualizer with the timestamps and other features in Streamlit. Quizas correcciones.
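For context on the newly checked "Introduce SRT as output" item: WhisperX's aligned output is a list of segment dicts with `start`, `end`, and `text` keys, and SRT is just numbered cues with `HH:MM:SS,mmm` time ranges, so the conversion is only a few lines. A minimal sketch under that schema assumption; the helper name is illustrative and is not defined in app.py:

```python
def segments_to_srt(segments):
    """Serialize [{'start': float, 'end': float, 'text': str}, ...] to SRT."""
    def ts(seconds):
        # SRT timestamps use comma-separated milliseconds: HH:MM:SS,mmm
        ms = int(round(seconds * 1000))
        h, ms = divmod(ms, 3_600_000)
        m, ms = divmod(ms, 60_000)
        s, ms = divmod(ms, 1_000)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    cues = []
    for i, seg in enumerate(segments, start=1):
        cues.append(f"{i}\n{ts(seg['start'])} --> {ts(seg['end'])}\n{seg['text'].strip()}\n")
    return "\n".join(cues)
```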
app.py CHANGED
@@ -38,20 +38,30 @@ def transcribe(audiofile, model, preprocesamiento):
  #Archivo
  nombre_archivo = guardar_en_archivo(out)

- return audio_path, audio_normalized_path, vocal_path, novocal_path, nombre_archivo, "\n".join(out), json.dumps(result)
+ results_short = result
+ results_short["segments"] = results_short["segments"][0:10]
+
+ return audio_path, audio_normalized_path, vocal_path, novocal_path, nombre_archivo, "\n".join(out), json.dumps(results_short)

- def transcribeWhisperX(audiofile, model, language, patience,
-                        initial_prompt, condition_on_previous_text, temperature,
-                        compression, logprob, no_speech_threshold):
+ def transcribeWhisperX(audiofile, model, language, preprocesamiento
+                        #patience, initial_prompt, condition_on_previous_text, temperature,
+                        #compression, logprob, no_speech_threshold
+                        ):
+
+ #if audiofile.type is not str:
  audio_path = audiofile.name

- audio_normalized_path = normalizeAudio(audio_path, ".wav")
-
- novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
-
- novocal_path = mp3_to_wav(novocal_path, "novocal")
- vocal_path = mp3_to_wav(vocal_path, "vocal")
+ if preprocesamiento == "Pre-procesamiento del audio":
+     audio_normalized_path = normalizeAudio(audio_path, ".wav")
+
+     novocal_path, vocal_path = separateVoiceInstrumental(audio_normalized_path)
+
+     novocal_path = mp3_to_wav(novocal_path, "novocal")
+     vocal_path = mp3_to_wav(vocal_path, "vocal")
+ else:
+     audio_normalized_path = audio_path
+     novocal_path = audio_path
+     vocal_path = audio_path

  #result = fast_transcription(vocal_path, model, "es")
  result_whisper, result_aligned, result_speakers, diarize_segments = doWhisperX(vocal_path, whisper_model=model, language=language)
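A note on the `results_short = result` lines in this hunk: plain assignment only binds a second name to the same dict, so the `[0:10]` slice also truncates `result["segments"]` itself. That is harmless here because `result` is not reused before returning, but if the full result were ever needed downstream, copying first would be safer. A minimal sketch of that defensive variant (not what the commit does; `result` stands in for the dict returned by transcription, and `json` is the same module app.py already calls):

```python
import json

# Shallow-copy the top-level dict so truncating the preview does not
# mutate the full transcription result. The segment dicts themselves
# stay shared, which is fine for read-only JSON serialization.
results_short = dict(result)
results_short["segments"] = results_short["segments"][:10]
preview = json.dumps(results_short)
```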
@@ -92,8 +102,11 @@ def transcribeWhisperX(audiofile, model, language, patience,

  ############################################################################

+ results_short = result_speakers
+ results_short["segments"] = results_short["segments"][0:10]
+
  outputs = (audio_path, audio_normalized_path, vocal_path, novocal_path, lineas_txt_string, htmlout,
-            nombre_file_txt, str(file_path), guardar_dataframe_en_csv(diarize_segments), json.dumps(result_speakers))
+            nombre_file_txt, str(file_path), guardar_dataframe_en_csv(diarize_segments), json.dumps(results_short))

  return outputs
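For readers of the `outputs` tuple above: `guardar_dataframe_en_csv(diarize_segments)` is defined elsewhere in app.py, so only its call site is visible in this diff. Judging from how its return value fills a file output slot, it presumably writes the diarization segments (a pandas DataFrame in the WhisperX pipeline) to disk and returns the path. A hedged sketch of such a helper; the signature and default filename are assumptions, not code from this commit:

```python
import pandas as pd

def guardar_dataframe_en_csv(df: pd.DataFrame, path: str = "diarize_segments.csv") -> str:
    # Persist the diarization segments (speaker, start, end, ...) and
    # return the file path so a Gradio File output can serve it.
    df.to_csv(path, index=False)
    return path
```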
@@ -128,13 +141,14 @@ transcribeII = gr.Interface(
  gr.File(label="Upload Files"),
  gr.Radio(["base", "small", "medium", "large-v2"], label="Modelo", value="large-v2"),
  gr.Dropdown(["Cualquiera","es","en","fr","pt"], label="Lenguaje", value="Cualquiera"),
- gr.Slider(minimum=0, maximum=1, label="Patience (Whisper parameter)", value=0.5, info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search"),
- gr.Textbox(label="Initial Prompt (Whisper parameter)", value=""),
- gr.Textbox(label="Condition on previous text (Whisper parameter)", value=""),
- gr.Slider(minimum=0, maximum=1, label="Temperature (Whisper parameter)", value=0.5, info="Temperature to use for sampling"),
- gr.Slider(minimum=0, maximum=1, label="Compression Ratio Threshold (Whisper parameter)", value=0.5),
- gr.Slider(minimum=0, maximum=1, label="Logprob Threshold (Whisper parameter)", value=0.5),
- gr.Slider(minimum=0, maximum=1, label="No Speech Threshold (Whisper parameter)", value=0.5),
+ gr.Radio(["Audio Original","Pre-procesamiento del audio"], label="Mejora del audio", value="Pre-procesamiento del audio"),
+ #gr.Slider(minimum=0, maximum=1, label="Patience (Whisper parameter)", value=0.5, info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search"),
+ #gr.Textbox(label="Initial Prompt (Whisper parameter)", value=""),
+ #gr.Textbox(label="Condition on previous text (Whisper parameter)", value=""),
+ #gr.Slider(minimum=0, maximum=1, label="Temperature (Whisper parameter)", value=0.5, info="Temperature to use for sampling"),
+ #gr.Slider(minimum=0, maximum=1, label="Compression Ratio Threshold (Whisper parameter)", value=0.5),
+ #gr.Slider(minimum=0, maximum=1, label="Logprob Threshold (Whisper parameter)", value=0.5),
+ #gr.Slider(minimum=0, maximum=1, label="No Speech Threshold (Whisper parameter)", value=0.5),
  ],
  outputs=[gr.Audio(type="filepath", label="original"),
  gr.Audio(type="filepath", label="normalized"),
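With the Whisper-parameter widgets commented out, the interface declares four inputs, and `gr.Interface` passes input values to the wrapped function positionally, so the list order must match `transcribeWhisperX(audiofile, model, language, preprocesamiento)` one-to-one. A stripped-down sketch of that wiring; the outputs are reduced to a single textbox here for brevity, whereas the real app returns a longer tuple of components:

```python
import gradio as gr

def transcribeWhisperX(audiofile, model, language, preprocesamiento):
    # Parameter order mirrors the inputs list below, one-to-one.
    return f"{model} | {language} | {preprocesamiento}"

demo = gr.Interface(
    fn=transcribeWhisperX,
    inputs=[
        gr.File(label="Upload Files"),                      # -> audiofile
        gr.Radio(["base", "small", "medium", "large-v2"],
                 label="Modelo", value="large-v2"),         # -> model
        gr.Dropdown(["Cualquiera", "es", "en", "fr", "pt"],
                    label="Lenguaje", value="Cualquiera"),  # -> language
        gr.Radio(["Audio Original", "Pre-procesamiento del audio"],
                 label="Mejora del audio",
                 value="Pre-procesamiento del audio"),      # -> preprocesamiento
    ],
    outputs=gr.Textbox(label="output"),
)
```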
@@ -158,13 +172,14 @@ transcribeII = gr.Interface(
  examples=[["Espana 04 - Video 01 - extracto 2 min.wav",
  "large-v2",
  "Cualquiera",
- 0.5,
- "",
- "",
- 0.5,
- 0.5,
- 0.5,
- 0.5]]
+ #0.5,
+ #"",
+ #"",
+ #0.5,
+ #0.5,
+ #0.5,
+ #0.5
+ ]]

  )
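One thing worth flagging in this last hunk: the interface now has four input components, but the surviving example row supplies only three values (file, model, language). Gradio expects one example value per input, so the new "Mejora del audio" Radio likely needs an entry as well. A hedged sketch of the corrected row:

```python
examples=[["Espana 04 - Video 01 - extracto 2 min.wav",
           "large-v2",
           "Cualquiera",
           "Pre-procesamiento del audio"]]  # value for the new Radio input
```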
 
 