alessandro trinca tornidor committed
Commit bd49a31 • 1 parent: dc92d10

feat: play word by word with the correct TTS pronunciation

Browse files:
- aip_trainer/lambdas/js.py +20 -7
- aip_trainer/lambdas/lambdaSpeechToScore.py +16 -16
- app.py +29 -6
aip_trainer/lambdas/js.py CHANGED

@@ -24,17 +24,21 @@ function updateCssText(text, letters, idxSelectedWord) {
 """
 
 js_play_audio = """
-function playAudio(text, language) {
+function playAudio(text, language, sleepTime = 0) {
     let voice_idx = 0;
     let voice_synth = null;
     let synth = window.speechSynthesis;
     let voice_lang;
+    let sleepTimeAdditional = 500; // for some reason using this with a default input argument give an object instead of the correct number
+
+    function sleep (time) {
+        return new Promise((resolve) => setTimeout(resolve, time));
+    }
 
     function setSpeech() {
         return new Promise(
             function (resolve, reject) {
                 let id;
-
                 id = setInterval(() => {
                     if (synth.getVoices().length !== 0) {
                         resolve(synth.getVoices());
@@ -53,9 +57,10 @@ function playAudio(text, language) {
             voice_lang = 'en-US';
             break;
         default:
-
-
-            alert(
+            const msg = `Error: language ${language} not valid!`
+            console.error(msg);
+            alert(msg)
+            throw new Error(msg)
     }
 
     let s = setSpeech();
@@ -66,7 +71,8 @@ function playAudio(text, language) {
         voicesSynth = voices.filter(voice => voice.lang.startsWith(language));
     }
     if (voicesSynth.length === 0) {
-        msg = `Error: no voice found for language ${voice_lang} / ${language}, you should use the Text-To-Speech backend feature...`
+        const msg = `Error: no voice found for language ${voice_lang} / ${language}, you should use the Text-To-Speech backend feature...`
+        console.error(msg);
         alert(msg)
         throw new Error(msg)
     }
@@ -74,7 +80,14 @@ function playAudio(text, language) {
     utterThis.voice = voicesSynth[0];
     utterThis.rate = 0.7;
 
-    synth.speak(utterThis);
+    if (sleepTime > 0) {
+        sleepTime *= 1000;
+        sleepTime += sleepTimeAdditional;
+    }
+    console.log("start js_play_audio:: sleepTime:", sleepTime, "#")
+    sleep(sleepTime).then(() => {
+        synth.speak(utterThis);
+    })
     // todo: capture audio from speech synthesis to reuse on the frontend
     // https://stackoverflow.com/questions/45003548/how-to-capture-generated-audio-from-window-speechsynthesis-speak-call
 });
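Note: the updated js_play_audio above delays synth.speak() by the duration of the selected word's audio clip (seconds converted to milliseconds, plus a fixed 500 ms buffer), so the browser TTS pronunciation starts only after the recorded clip has finished playing. A minimal Python sketch of that delay computation, for reference only (the actual logic is the JS shown above; compute_sleep_ms is an illustrative name):

    def compute_sleep_ms(sleep_time_s: float, additional_ms: int = 500) -> float:
        # mirrors: if (sleepTime > 0) { sleepTime *= 1000; sleepTime += sleepTimeAdditional; }
        if sleep_time_s > 0:
            return sleep_time_s * 1000 + additional_ms
        return 0.0

    assert compute_sleep_ms(0.7) == 1200.0  # a 0.7 s word clip delays TTS by 1.2 s
    assert compute_sleep_ms(0.0) == 0.0     # no duration given: speak immediately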
aip_trainer/lambdas/lambdaSpeechToScore.py CHANGED

@@ -164,8 +164,8 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
     num_words = len(end_time)
     app_logger.debug(f"start splitting recorded audio into {num_words} words...")
 
-    audio_files = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
-    output = {'audio_files': audio_files, **output}
+    audio_files, audio_durations = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
+    output = {'audio_files': audio_files, "audio_durations": audio_durations, **output}
     first_audio_file = audio_files[0]
     return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, first_audio_file, json.dumps(output)
 
@@ -175,36 +175,36 @@ def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int):
     sf.write(audiofile, data, samplerate)
 
 
-def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> str:
+def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str]:
     recognition_output = json.loads(raw_json_output)
     list_audio_files = recognition_output["audio_files"]
     real_transcripts = recognition_output["real_transcripts"]
+    audio_durations = recognition_output["audio_durations"]
     real_transcripts_list = real_transcripts.split()
     app_logger.info(f"idx_recorded_word:{idx_recorded_word} ...")
     current_word = real_transcripts_list[idx_recorded_word]
-    app_logger.info(f"
-
+    app_logger.info(f"current word:{current_word} ...")
+    current_duration = audio_durations[idx_recorded_word]
+    app_logger.info(f"current_duration:{current_duration} ...")
+    return list_audio_files[idx_recorded_word], current_word, current_duration
 
 
-def
-    input_json = json.loads(text_raw_json_output_hidden)
-    start_time = input_json["start_time"]
-    end_time = input_json["end_time"]
-    return get_splitted_audio_file(audiotmpfile, start_time, end_time)
-
-
-def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float], signal: np.ndarray = None, samplerate: int = None) -> list[str]:
+def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float], signal: np.ndarray = None, samplerate: int = None) -> tuple[list[str], list[float]]:
     import soundfile as sf
     audio_files = []
+    if signal is not None:
+        audiotmpfile = sf.SoundFile(signal, samplerate=samplerate)
+    audio_durations = []
     for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
-        if signal is not None:
-            audiotmpfile = sf.SoundFile(signal, samplerate=samplerate)
         signal_nth, samplerate = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
         audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
         soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate)
         app_logger.info(f"audio file {audiofile} written...")
         audio_files.append(str(audiofile))
-    return audio_files
+        duration = end_nth - start_nth
+        app_logger.info(f"audio file {audiofile} has duration {duration}...")
+        audio_durations.append(duration)
+    return audio_files, audio_durations
 
 
 def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> list[str]:
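Note: get_splitted_audio_file and get_selected_word now return per-word durations alongside the audio paths, and the durations travel inside the serialized JSON output. A usage sketch under those assumptions (the payload below is hypothetical, shaped like the JSON built by get_speech_to_score_tuple above; file paths are illustrative):

    import json

    raw_json_output = json.dumps({
        "audio_files": ["/tmp/rec_part0_start0.0_end0.5.wav",
                        "/tmp/rec_part1_start0.5_end1.2.wav"],
        "audio_durations": [0.5, 0.7],
        "real_transcripts": "hello world",
    })

    # returns the clip path, the word text, and the clip duration for index 1
    audio_file, word, duration = get_selected_word(1, raw_json_output)
    assert (word, duration) == ("world", 0.7)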
app.py CHANGED

@@ -15,10 +15,14 @@ css = """
 word_idx_text = "Selected word index"
 
 
-def get_textbox_hidden():
+def get_textbox_hidden(text = None):
+    if text:
+        return gr.Number(value=text, visible=False)
     return gr.Textbox(visible=False)
 
-def get_number_hidden():
+def get_number_hidden(x: int = None):
+    if x:
+        return gr.Number(value=x, visible=False)
     return gr.Number(visible=False)
 
 def clear():
@@ -80,6 +84,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             elem_id="audio-student-recording-stt-id-element",
         )
     with gr.Row():
+        num_audio_duration_hidden = gr.Number(label="num_first_audio_duration", value=0, interactive=False, visible=False)
         with gr.Accordion("Click here to expand the table examples", open=False, elem_id="accordion-examples-id-element"):
             examples_text = gr.Examples(
                 examples=[
@@ -140,10 +145,17 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             show_download_button=True,
            elem_id="audio-splitted-student-recording-stt-id-element",
        )
+        text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)
 
     def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
+        import json
         _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_word, first_audio_file, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
         new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
+        words_list = _transcribed_text.split()
+        first_word = words_list[0]
+        json_res_loaded = json.loads(_res)
+        audio_durations = json_res_loaded["audio_durations"]
+        first_audio_duration = audio_durations[0]
         output = {
             text_transcribed_hidden: _transcribed_text,
             text_letter_correctness: _letter_correctness,
@@ -153,7 +165,9 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             text_raw_json_output_hidden: _res,
             num_tot_recognized_words: _num_tot_recognized_word,
             num_selected_recognized_word: new_num_selected_recognized_word,
-            audio_splitted_student_recording_stt: first_audio_file
+            audio_splitted_student_recording_stt: first_audio_file,
+            text_selected_recognized_word_hidden: first_word,
+            num_audio_duration_hidden: first_audio_duration
         }
         match lang:
             case "de":
@@ -185,7 +199,9 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             num_score_en,
             num_tot_recognized_words,
             num_selected_recognized_word,
-            audio_splitted_student_recording_stt
+            audio_splitted_student_recording_stt,
+            text_selected_recognized_word_hidden,
+            num_audio_duration_hidden
         ],
     )
 
@@ -234,7 +250,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
     )
     text_recording_ipa.change(
         None,
-        inputs=[
+        inputs=[get_textbox_hidden(), get_textbox_hidden(), get_number_hidden()],
         outputs=[html_output],
         js=js.js_update_ipa_output,
     )
@@ -264,7 +280,14 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
     num_selected_recognized_word.input(
         fn=lambdaSpeechToScore.get_selected_word,
         inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
-        outputs=[audio_splitted_student_recording_stt],
+        outputs=[audio_splitted_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
+    )
+    audio_splitted_student_recording_stt.play(
+        fn=None,
+        # text, language, sleepTime = null, prefix = null
+        inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
+        outputs=audio_splitted_student_recording_stt,
+        js=js.js_play_audio
     )
 
     @gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
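Note: with this wiring, changing the selected word index refreshes three components at once (the split-audio player, the hidden word textbox, and the hidden duration), and pressing play on the clip fires js.js_play_audio with (text, language, sleepTime) taken positionally from the inputs list. A stripped-down sketch of the same pattern (hypothetical components; the inline JS stands in for js.js_play_audio):

    import gradio as gr

    # stand-in for js.js_play_audio: receives the inputs list positionally
    js_speak = "(text, language) => { console.log('TTS for:', text, language); }"

    with gr.Blocks() as demo:
        word_hidden = gr.Textbox(value="hello", visible=False)
        lang_hidden = gr.Textbox(value="en", visible=False)
        clip = gr.Audio(label="word clip")
        # when the user plays the clip, run the JS hook with the hidden values,
        # mirroring audio_splitted_student_recording_stt.play(...) above
        clip.play(fn=None, inputs=[word_hidden, lang_hidden], js=js_speak)

    demo.launch()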