Spaces:
Running
Running
File size: 15,122 Bytes
0931910 d804881 6777887 0931910 9ab32d7 d51ffe7 3a8b45a 38f204d 3a8b45a 1bd485a 918182d bd49a31 918182d 3a8b45a bd49a31 dc92d10 3a8b45a 1470bc9 d51ffe7 183d840 5d533ea 9ab32d7 0931910 a9078f6 9ab32d7 d51ffe7 38f204d d51ffe7 a9078f6 d51ffe7 38f204d d51ffe7 38f204d d51ffe7 9ab32d7 38f204d 9ab32d7 38f204d 9ab32d7 38f204d 9ab32d7 d51ffe7 38f204d 6957865 183d840 38f204d 6957865 9ab32d7 38f204d 290bfe0 9ab32d7 1470bc9 38f204d 9ab32d7 d009a59 bd49a31 290bfe0 d009a59 d51ffe7 8b62994 3a8b45a 9ab32d7 a9078f6 9ab32d7 a9078f6 38f204d 9ab32d7 a9078f6 38f204d 9ab32d7 3a8b45a 38f204d 3a8b45a ce5c4e6 3a8b45a ce5c4e6 3a8b45a ce5c4e6 d009a59 ce5c4e6 290bfe0 dc92d10 290bfe0 d009a59 290bfe0 d009a59 bd49a31 ca7e6be bd49a31 dc92d10 bd49a31 ca7e6be 8b62994 a9078f6 918182d a9078f6 dc92d10 bd49a31 ca7e6be 918182d ca7e6be 918182d ca7e6be d009a59 ca7e6be 918182d 9ab32d7 8b62994 a9078f6 918182d a9078f6 918182d c0961d5 dc92d10 bd49a31 9ab32d7 918182d ce5c4e6 dc92d10 290bfe0 ce5c4e6 dc92d10 918182d dc92d10 918182d dc92d10 918182d dc92d10 918182d ce5c4e6 918182d bd49a31 918182d 38f204d 6957865 d51ffe7 38f204d 1470bc9 dc92d10 a9078f6 38f204d 1470bc9 38f204d d51ffe7 9ab32d7 dc92d10 9ab32d7 0931910 9ab32d7 dc92d10 bd49a31 dc92d10 5d533ea 918182d 5d533ea 918182d 5d533ea d804881 d009a59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 |
from pathlib import Path
import gradio as gr
from aip_trainer import PROJECT_ROOT_FOLDER, app_logger, sample_rate_start
from aip_trainer.lambdas import js, lambdaGetSample, lambdaSpeechToScore, lambdaTTS
# Custom CSS handed to ``gr.Blocks(css=...)``.
# The former ``margin-bottom: white`` declaration on ``.speech-output-label p``
# was removed: ``white`` is a colour, not a length, so browsers discarded that
# declaration anyway and the rendered page is unchanged.
css = """
.speech-output-label p {color: grey;}
.background-white {background-color: white !important; }
.speech-output-group {padding: 12px;}
.speech-output-container {min-height: 60px;}
.speech-output-html {text-align: left; }
"""
# Base label shared by every ``gr.Number`` that shows the selected word index.
word_idx_text = "Current word index"
def get_textbox_hidden(text=None):
    """Build an invisible ``gr.Textbox``, optionally pre-filled with *text*.

    Args:
        text: Initial value for the textbox. ``None`` (the default) leaves the
            component's value unset.

    Returns:
        A ``gr.Textbox`` created with ``visible=False``.
    """
    if text is not None:
        # The previous code returned a ``gr.Number`` here, which contradicts
        # the function name and cannot hold arbitrary text.
        return gr.Textbox(value=text, visible=False)
    return gr.Textbox(visible=False)
def get_number_hidden(x: int | None = None):
    """Build an invisible ``gr.Number``, optionally pre-filled with *x*.

    Args:
        x: Initial value for the number field. ``None`` (the default) leaves
            the component's value unset.

    Returns:
        A ``gr.Number`` created with ``visible=False``.
    """
    # Compare against None explicitly: a plain truthiness test would silently
    # drop a legitimate initial value of 0.
    if x is not None:
        return gr.Number(value=x, visible=False)
    return gr.Number(visible=False)
def clear() -> None:
    """Return ``None`` as the new value for a single output component.

    Wired to the "Clear TTS backend" button, whose only output is ``audio_tts``.
    """
    cleared_value = None
    return cleared_value
def clear2() -> tuple[None, None]:
    """Return a pair of ``None`` values, one per output component.

    Wired to the random-phrase button with the student recording and the TTS
    audio as outputs.
    """
    recording_value, tts_value = None, None
    return recording_value, tts_value
# Build the Gradio UI: the custom `css` string and `js.head_driver_tour` (extra <head> content)
# are handed to gr.Blocks; the resulting app object is `gradio_app`.
with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
# Browser-persisted state initialised with two floats; the load/save handlers further down
# map index 0 to the DE global score and index 1 to the EN global score.
local_storage = gr.BrowserState([0.0, 0.0])
app_logger.info("start gradio app building...")
project_root_folder = Path(PROJECT_ROOT_FOLDER)
# The app description is a markdown file used as a str.format template:
# `sample_rate_start` is the only value substituted into it here.
with open(project_root_folder / "aip_trainer" / "lambdas" / "app_description.md", "r", encoding="utf-8") as app_description_src:
md_app_description = app_description_src.read()
gr.Markdown(md_app_description.format(sample_rate_start=sample_rate_start))
# --- Input side: language/difficulty selectors, phrase to read, TTS and recording widgets ---
with gr.Row():
with gr.Column(scale=4, min_width=300):
with gr.Row():
with gr.Column(scale=2, min_width=80):
# Language selector: "de" or "en", defaulting to "en".
radio_language = gr.Radio(["de", "en"], label="Language", value="en", elem_id="radio-language-id-element")
with gr.Column(scale=5, min_width=160):
# Difficulty selector: choices are (label, value) pairs; the default value 0 is "random".
radio_difficulty = gr.Radio(
label="Difficulty",
value=0,
choices=[
("random", 0),
("easy", 1),
("medium", 2),
("hard", 3),
],
elem_id="radio-difficulty-id-element",
)
with gr.Column(scale=1, min_width=100):
btn_random_phrase = gr.Button(value="Choose a random phrase", elem_id="btn-random-phrase-id-element")
with gr.Row():
with gr.Column(scale=7, min_width=300):
# Reference phrase the student is expected to read aloud.
text_student_transcription = gr.Textbox(
lines=3,
label="Phrase to read for speech recognition",
value="Hi there, how are you?",
elem_id="text-student-transcription-id-element",
)
with gr.Row():
# Audio player that receives the text-to-speech result.
audio_tts = gr.Audio(label="Audio TTS", elem_id="audio-tts-id-element")
with gr.Row():
btn_run_tts = gr.Button(value="TTS in browser", elem_id="btn-run-tts-id-element")
btn_run_tts_backend = gr.Button(value="TTS backend", elem_id="btn-run-tts-backend-id-element")
btn_clear_tts = gr.Button(value="Clear TTS backend", elem_id="btn-clear-tts-backend-id-element")
# `clear` returns None, which becomes the new value of `audio_tts`.
btn_clear_tts.click(clear, inputs=[], outputs=[audio_tts])
with gr.Row():
# Student recording: microphone or upload; type="filepath" means the value is a file path.
audio_student_recording_stt = gr.Audio(
label="Record a speech to evaluate",
sources=["microphone", "upload"],
type="filepath",
show_download_button=True,
elem_id="audio-student-recording-stt-id-element",
)
with gr.Row():
# Hidden, read-only number; later handlers store an audio duration in it.
num_audio_duration_hidden = gr.Number(label="num_first_audio_duration", value=0, interactive=False, visible=False)
with gr.Accordion("Click here to expand the table examples", open=False, elem_id="accordion-examples-id-element"):
# Each example row is [phrase, language, difficulty] and fills the three inputs listed below.
examples_text = gr.Examples(
examples=[
["Hallo, wie geht es dir?", "de", 1],
["Hi there, how are you?", "en", 1],
["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau.", "de", 2,],
["Rome is home to some of the most beautiful monuments in the world.", "en", 2],
["Die König-Ludwig-Eiche ist ein Naturdenkmal im Staatsbad Brückenau, einem Ortsteil des drei Kilometer nordöstlich gelegenen Bad Brückenau im Landkreis Bad Kissingen in Bayern.", "de", 3],
["Some machine learning models are designed to understand and generate human-like text based on the input they receive.", "en", 3],
],
inputs=[text_student_transcription, radio_language, radio_difficulty],
elem_id="examples-text-id-element",
)
# --- Output side: transcriptions, accuracy HTML, scores and per-word playback ---
with gr.Column(scale=4, min_width=320):
# Hidden textboxes: they carry the transcribed text and the letters correctness
# that are passed as inputs to the client-side JS rendering hook.
text_transcribed_hidden = gr.Textbox(
placeholder=None, label="Transcribed text", visible=False
)
text_letter_correctness = gr.Textbox(
placeholder=None,
label="Letters correctness",
visible=False,
)
text_recording_ipa = gr.Textbox(
placeholder=None, label="Student phonetic transcription", elem_id="text-student-recording-ipa-id-element"
)
text_ideal_ipa = gr.Textbox(
placeholder=None, label="Ideal phonetic transcription", elem_id="text-ideal-ipa-id-element"
)
# Hidden textbox holding the raw JSON string returned by the scoring call.
text_raw_json_output_hidden = gr.Textbox(placeholder=None, label="text_raw_json_output_hidden", visible=False)
with gr.Group(elem_classes="speech-output-group background-white"):
gr.Markdown("Speech accuracy output", elem_classes="speech-output-label background-white")
with gr.Group(elem_classes="speech-output-container background-white"):
# HTML area updated by `js.js_update_ipa_output`; starts with the placeholder " - ".
html_output = gr.HTML(
label="Speech accuracy output",
elem_id="speech-output",
show_label=False,
visible=True,
render=True,
value=" - ",
elem_classes="speech-output-html background-white",
)
with gr.Row():
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col1"):
num_pronunciation_accuracy = gr.Number(label="Current score %", elem_id="number-pronunciation-accuracy-id-element")
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col2"):
num_score_de = gr.Number(label="Global score DE %", value=0, interactive=False, elem_id="number-score-de-id-element")
with gr.Column(min_width=100, elem_classes="speech-accuracy-score-container row2 col3"):
num_score_en = gr.Number(label="Global score EN %", value=0, interactive=False, elem_id="number-score-en-id-element")
btn_recognize_speech_accuracy = gr.Button(value="Get speech accuracy score (%)", elem_id="btn-recognize-speech-accuracy-id-element")
with gr.Row():
# Hidden counter of recognized words; its change event resizes the word-index selector.
num_tot_recognized_words = gr.Number(label="Total recognized words", visible=False, minimum=0, interactive=False)
with gr.Column(scale=1, min_width=50):
# Index of the currently selected recognized word; read-only until a recognition result exists.
num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, minimum=0, value=0, interactive=False)
with gr.Column(scale=4, min_width=100):
# Audio clip for the selected word, split out of the student recording.
audio_splitted_student_recording_stt = gr.Audio(
label="Splitted student speech output",
type="filepath",
show_download_button=True,
elem_id="audio-splitted-student-recording-stt-id-element",
)
# Hidden textbox with the text of the selected word; starts with the literal "placeholder".
text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)
def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
    """Score a recording against *text* and build the component updates.

    Calls ``lambdaSpeechToScore.get_speech_to_score_tuple`` and returns a dict
    keyed by output component. The pronunciation accuracy is added to the
    running score of the selected language; the other score is passed through.

    Args:
        text: Reference phrase the student was asked to read.
        audio_rec: Path of the student recording.
        lang: Language code, ``"de"`` or ``"en"``.
        score_de: Current global score for German.
        score_en: Current global score for English.

    Returns:
        A dict mapping each output component to its new value.

    Raises:
        NotImplementedError: If *lang* is neither ``"de"`` nor ``"en"``.
    """
    import json

    (
        transcribed,
        letter_correctness,
        accuracy,
        recording_ipa,
        ideal_ipa,
        tot_recognized_words,
        first_audio_file,
        raw_json,
    ) = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)

    # Reset the word selector to index 0 for the fresh recognition result.
    word_index_reset = gr.Number(label=word_idx_text, visible=True, value=0)
    first_word = transcribed.split()[0]
    first_audio_duration = json.loads(raw_json)["audio_durations"][0]

    updates = {
        text_transcribed_hidden: transcribed,
        text_letter_correctness: letter_correctness,
        num_pronunciation_accuracy: accuracy,
        text_recording_ipa: recording_ipa,
        text_ideal_ipa: ideal_ipa,
        text_raw_json_output_hidden: raw_json,
        num_tot_recognized_words: tot_recognized_words,
        num_selected_recognized_word: word_index_reset,
        audio_splitted_student_recording_stt: first_audio_file,
        text_selected_recognized_word_hidden: first_word,
        num_audio_duration_hidden: first_audio_duration,
    }

    if lang == "de":
        updates[num_score_de] = float(score_de) + float(accuracy)
        updates[num_score_en] = float(score_en)
    elif lang == "en":
        updates[num_score_en] = float(score_en) + float(accuracy)
        updates[num_score_de] = float(score_de)
    else:
        raise NotImplementedError(f"Language {lang} not supported")
    return updates
# Run the scoring handler when the score button is clicked.
# The handler returns a dict keyed by component, so every component it uses as a key
# (including both global scores) is listed in `outputs`.
btn_recognize_speech_accuracy.click(
get_updated_score_by_language,
inputs=[text_student_transcription, audio_student_recording_stt, radio_language, num_score_de, num_score_en],
outputs=[
text_transcribed_hidden,
text_letter_correctness,
num_pronunciation_accuracy,
text_recording_ipa,
text_ideal_ipa,
text_raw_json_output_hidden,
num_score_de,
num_score_en,
num_tot_recognized_words,
num_selected_recognized_word,
audio_splitted_student_recording_stt,
text_selected_recognized_word_hidden,
num_audio_duration_hidden
],
)
def change_max_selected_words(n):
    """Rebuild the word-index selector so its range matches *n* recognized words.

    Args:
        n: Total number of recognized words (the value of
            ``num_tot_recognized_words``). ``None`` is treated as 0.

    Returns:
        A new ``gr.Number`` with value 0, minimum 0 and maximum ``n - 1``
        (never below 0). It is interactive only when at least one word exists.
    """
    app_logger.info(f"change_max_selected_words: {n} ...")
    # An empty number field yields None; treat it like "no words" rather than
    # failing on ``None - 1``.
    total = 0 if n is None else n
    # Clamp at 0: with no words the previous code produced maximum=-1, which
    # is below minimum=0.
    num_max_selected_words = max(total - 1, 0)
    app_logger.info(f"num_selected_recognized_words.maximum, pre: {num_selected_recognized_word.maximum} ...")
    label = word_idx_text if total == 0 else f"{word_idx_text} (from 0 to {num_max_selected_words})"
    interactive = total > 0
    app_logger.info(f"change_max_selected_words: {n}, is interactive? {interactive} ...")
    new_num_selected_recognized_words = gr.Number(
        label=label,
        visible=True,
        value=0,
        minimum=0,
        maximum=num_max_selected_words,
        interactive=interactive,
    )
    # Report the maximum of the component just built; the attribute of the
    # module-level component is not modified by this function.
    app_logger.info(f"num_selected_recognized_words.maximum, post: {new_num_selected_recognized_words.maximum} ...")
    return new_num_selected_recognized_words
# Whenever the total number of recognized words changes, rebuild the word-index selector
# (label, maximum and interactivity) from the new total.
num_tot_recognized_words.change(
fn=change_max_selected_words,
inputs=[num_tot_recognized_words],
outputs=[num_selected_recognized_word],
)
def clear3():
    """Return reset values for the components tied to the current phrase.

    The tuple lines up position by position with the ``outputs`` list of the
    ``text_student_transcription.change`` listener below: six ``None`` values
    for the audio and text components, then ``0`` for the current score and
    ``0`` for the selected word index.
    """
    return None, None, None, None, None, None, 0, 0


# Editing the phrase invalidates the previous recording and its results.
# ``num_pronunciation_accuracy`` used to be listed twice in ``outputs`` (with a
# matching extra ``0`` returned by ``clear3``); the duplicate was removed.
# NOTE(review): if the second entry was meant to be another component, add it
# here together with a matching value in ``clear3``.
text_student_transcription.change(
    clear3,
    inputs=[],
    outputs=[
        audio_student_recording_stt,
        audio_tts,
        audio_splitted_student_recording_stt,
        text_recording_ipa,
        text_ideal_ipa,
        text_transcribed_hidden,
        num_pronunciation_accuracy,
        num_selected_recognized_word,
    ],
)
def reset_max_total_recognized_words(content_text_recording_ipa, content_num_tot_recognized_words):
    """Zero the recognized-words counter when the student IPA text is empty.

    Args:
        content_text_recording_ipa: Current value of the student phonetic
            transcription textbox.
        content_num_tot_recognized_words: Current total of recognized words.

    Returns:
        *content_num_tot_recognized_words* unchanged while the IPA text is
        non-empty; otherwise a hidden, read-only ``gr.Number`` with value 0.
    """
    has_ipa_text = content_text_recording_ipa is not None and content_text_recording_ipa != ""
    if has_ipa_text:
        return content_num_tot_recognized_words
    app_logger.info("reset_max_total_recognized_words...")
    return gr.Number(label="Total recognized words", visible=False, value=0, minimum=0, interactive=False)
# When the student IPA text changes, reset the recognized-words total if that text is now empty.
text_recording_ipa.change(
reset_max_total_recognized_words,
inputs=[text_recording_ipa, num_tot_recognized_words],
outputs=[
num_tot_recognized_words
],
)
# Second listener on the same event: no Python function, only the client-side hook
# `js.js_update_ipa_output`, fed by three freshly created hidden components without values.
# NOTE(review): presumably this blanks the HTML output before the new result is rendered - confirm against the JS.
text_recording_ipa.change(
None,
inputs=[get_textbox_hidden(), get_textbox_hidden(), get_number_hidden()],
outputs=[html_output],
js=js.js_update_ipa_output,
)
# "TTS in browser": client-side only (fn=None), handled by `js.js_play_audio`.
btn_run_tts.click(fn=None, inputs=[text_student_transcription, radio_language], outputs=audio_tts, js=js.js_play_audio)
# "TTS backend": the audio comes from `lambdaTTS.get_tts` on the server.
btn_run_tts_backend.click(
fn=lambdaTTS.get_tts,
inputs=[text_student_transcription, radio_language],
outputs=audio_tts,
)
# The random-phrase button has two listeners: one fetches a new phrase for the chosen
# language and difficulty, the other sets the recording and TTS audio to None.
btn_random_phrase.click(
fn=lambdaGetSample.get_random_selection,
inputs=[radio_language, radio_difficulty],
outputs=[text_student_transcription],
)
btn_random_phrase.click(
clear2,
inputs=[],
outputs=[audio_student_recording_stt, audio_tts]
)
# Re-run the client-side rendering hook with the transcribed text, the letters correctness
# and the selected word index whenever the HTML output changes.
# NOTE(review): this listener writes to the component it listens on - confirm it cannot re-trigger itself indefinitely.
html_output.change(
None,
inputs=[text_transcribed_hidden, text_letter_correctness, num_selected_recognized_word],
outputs=[html_output],
js=js.js_update_ipa_output,
)
# A user edit of the word index looks up that word from the raw JSON output and
# updates the split audio, the hidden word text and the hidden duration.
num_selected_recognized_word.input(
fn=lambdaSpeechToScore.get_selected_word,
inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
outputs=[audio_splitted_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
)
# Playing the split audio runs the same `js.js_play_audio` hook as the "TTS in browser" button,
# here with the selected word, the language and the hidden audio duration as inputs.
audio_splitted_student_recording_stt.play(
fn=None,
inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
outputs=audio_splitted_student_recording_stt,
js=js.js_play_audio
)
@gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
def load_from_local_storage(saved_values):
    """Restore both global scores from the browser state when the app loads.

    Args:
        saved_values: Value of ``local_storage``; index 0 is the DE score and
            index 1 the EN score.

    Returns:
        A ``(score_de, score_en)`` tuple for ``num_score_de`` and ``num_score_en``.
    """
    # Use the application logger, like the other handlers in this file,
    # instead of printing to stdout.
    app_logger.info(f"loading from local storage: {saved_values} ...")
    return saved_values[0], saved_values[1]
@gr.on([num_score_de.change, num_score_en.change], inputs=[num_score_de, num_score_en], outputs=[local_storage])
def save_to_local_storage(score_de, score_en):
    """Pack both global scores into the list stored in ``local_storage``.

    Runs whenever either global score changes; the list order is
    ``[score_de, score_en]``.
    """
    persisted_scores = [score_de, score_en]
    return persisted_scores
if __name__ == "__main__":
    try:
        gradio_app.launch()
    except Exception as e:
        # Top-level boundary: log the failure, then re-raise it. A bare
        # ``raise`` keeps the original traceback instead of adding a frame
        # for this line, as ``raise e`` would.
        app_logger.error(f"Error: {e}")
        raise
|