alessandro trinca tornidor committed
Commit bd49a31
Parent: dc92d10

feat: play word by word with the correct TTS pronunciation

aip_trainer/lambdas/js.py CHANGED
@@ -24,17 +24,21 @@ function updateCssText(text, letters, idxSelectedWord) {
 """
 
 js_play_audio = """
-function playAudio(text, language) {
+function playAudio(text, language, sleepTime = 0) {
     let voice_idx = 0;
     let voice_synth = null;
     let synth = window.speechSynthesis;
     let voice_lang;
+    let sleepTimeAdditional = 500; // for some reason using this with a default input argument gives an object instead of the correct number
+
+    function sleep(time) {
+        return new Promise((resolve) => setTimeout(resolve, time));
+    }
 
     function setSpeech() {
         return new Promise(
             function (resolve, reject) {
                 let id;
-
                 id = setInterval(() => {
                     if (synth.getVoices().length !== 0) {
                         resolve(synth.getVoices());
@@ -53,9 +57,10 @@ function playAudio(text, language) {
             voice_lang = 'en-US';
             break;
         default:
-            console.log(`Sorry, we are out of ${expr}.`);
-            throw new Error(`Language ${language} not valid!`)
-            alert(`Language ${language} not valid!`)
+            const msg = `Error: language ${language} not valid!`
+            console.error(msg);
+            alert(msg)
+            throw new Error(msg)
     }
 
     let s = setSpeech();
@@ -66,7 +71,8 @@ function playAudio(text, language) {
         voicesSynth = voices.filter(voice => voice.lang.startsWith(language));
     }
     if (voicesSynth.length === 0) {
-        msg = `Error: no voice found for language ${voice_lang} / ${language}, you should use the Text-To-Speech backend feature...`
+        const msg = `Error: no voice found for language ${voice_lang} / ${language}, you should use the Text-To-Speech backend feature...`
+        console.error(msg);
         alert(msg)
         throw new Error(msg)
     }
@@ -74,7 +80,14 @@ function playAudio(text, language) {
     utterThis.voice = voicesSynth[0];
     utterThis.rate = 0.7;
 
-    synth.speak(utterThis);
+    if (sleepTime > 0) {
+        sleepTime *= 1000;
+        sleepTime += sleepTimeAdditional;
+    }
+    console.log("start js_play_audio:: sleepTime:", sleepTime, "#")
+    sleep(sleepTime).then(() => {
+        synth.speak(utterThis);
+    })
     // todo: capture audio from speech synthesis to reuse on the frontend
     // https://stackoverflow.com/questions/45003548/how-to-capture-generated-audio-from-window-speechsynthesis-speak-call
 });
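
Note on the sleepTime handling above: the clip duration arrives in seconds from a Gradio Number component, is converted to milliseconds, and is padded with the fixed 500 ms buffer (sleepTimeAdditional) before speech starts. A minimal Python sketch of the same arithmetic, using a hypothetical helper name, not code from this commit:

    # Hypothetical helper: mirrors the sleepTime arithmetic that
    # js_play_audio performs before calling synth.speak().
    def compute_tts_delay_ms(clip_duration_s: float, extra_ms: int = 500) -> int:
        # clip_duration_s: duration of the recorded word clip, in seconds
        # extra_ms: fixed buffer, same role as sleepTimeAdditional in the JS
        if clip_duration_s <= 0:
            return 0  # speak immediately, as the JS does when sleepTime == 0
        return int(clip_duration_s * 1000) + extra_ms

    # e.g. a 0.7 s word clip delays the TTS playback by 1200 ms
    assert compute_tts_delay_ms(0.7) == 1200
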
aip_trainer/lambdas/lambdaSpeechToScore.py CHANGED
@@ -164,8 +164,8 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
     num_words = len(end_time)
     app_logger.debug(f"start splitting recorded audio into {num_words} words...")
 
-    audio_files = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
-    output = {'audio_files': audio_files, **output}
+    audio_files, audio_durations = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
+    output = {'audio_files': audio_files, "audio_durations": audio_durations, **output}
     first_audio_file = audio_files[0]
     return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, first_audio_file, json.dumps(output)
 
@@ -175,36 +175,36 @@ def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int):
     sf.write(audiofile, data, samplerate)
 
 
-def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> str:
+def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str, str, float]:
     recognition_output = json.loads(raw_json_output)
     list_audio_files = recognition_output["audio_files"]
     real_transcripts = recognition_output["real_transcripts"]
+    audio_durations = recognition_output["audio_durations"]
     real_transcripts_list = real_transcripts.split()
     app_logger.info(f"idx_recorded_word:{idx_recorded_word} ...")
     current_word = real_transcripts_list[idx_recorded_word]
-    app_logger.info(f"real_transcripts, current word:{current_word} ...")
-    return list_audio_files[idx_recorded_word]
+    app_logger.info(f"current word:{current_word} ...")
+    current_duration = audio_durations[idx_recorded_word]
+    app_logger.info(f"current_duration:{current_duration} ...")
+    return list_audio_files[idx_recorded_word], current_word, current_duration
 
 
-def get_audio_splitted(audiotmpfile: str | Path, text_raw_json_output_hidden: str) -> None:
-    input_json = json.loads(text_raw_json_output_hidden)
-    start_time = input_json["start_time"]
-    end_time = input_json["end_time"]
-    return get_splitted_audio_file(audiotmpfile, start_time, end_time)
-
-
-def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float], signal: np.ndarray = None, samplerate: int = None) -> list[str]:
+def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float], signal: np.ndarray = None, samplerate: int = None) -> tuple[list[str], list[float]]:
     import soundfile as sf
     audio_files = []
+    if signal is not None:
+        audiotmpfile = sf.SoundFile(signal, samplerate=samplerate)
+    audio_durations = []
     for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
-        if signal is not None:
-            audiotmpfile = sf.SoundFile(signal, samplerate=samplerate)
         signal_nth, samplerate = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
         audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
         soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate)
         app_logger.info(f"audio file {audiofile} written...")
         audio_files.append(str(audiofile))
-    return audio_files
+        duration = end_nth - start_nth
+        app_logger.info(f"audio file {audiofile} has duration {duration}...")
+        audio_durations.append(duration)
+    return audio_files, audio_durations
 
 
 def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> list[str]:
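
With these changes get_splitted_audio_file returns per-word durations alongside the split file paths, and get_selected_word passes the selected word and its duration on to the UI. A quick usage sketch, assuming a payload shaped like the JSON that get_speech_to_score_tuple now emits (file names and values below are made up; audio_durations[n] equals end_time[n] - start_time[n]):

    import json

    from aip_trainer.lambdas.lambdaSpeechToScore import get_selected_word

    # Illustrative payload, same shape as the app's raw JSON output
    raw_json_output = json.dumps({
        "audio_files": ["rec_part0_start0.0_end0.4.wav", "rec_part1_start0.4_end1.1.wav"],
        "audio_durations": [0.4, 0.7],
        "real_transcripts": "hallo welt",
    })

    audio_file, word, duration = get_selected_word(1, raw_json_output)
    # -> ("rec_part1_start0.4_end1.1.wav", "welt", 0.7)
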
app.py CHANGED
@@ -15,10 +15,14 @@ css = """
 word_idx_text = "Selected word index"
 
 
-def get_textbox_empty_hidden():
+def get_textbox_hidden(text=None):
+    if text:
+        return gr.Textbox(value=text, visible=False)
     return gr.Textbox(visible=False)
 
-def get_number_empty_hidden():
+def get_number_hidden(x: int = None):
+    if x:
+        return gr.Number(value=x, visible=False)
     return gr.Number(visible=False)
 
 def clear():
@@ -80,6 +84,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
                 elem_id="audio-student-recording-stt-id-element",
             )
         with gr.Row():
+            num_audio_duration_hidden = gr.Number(label="num_first_audio_duration", value=0, interactive=False, visible=False)
             with gr.Accordion("Click here to expand the table examples", open=False, elem_id="accordion-examples-id-element"):
                 examples_text = gr.Examples(
                     examples=[
@@ -140,10 +145,17 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
                 show_download_button=True,
                 elem_id="audio-splitted-student-recording-stt-id-element",
             )
+        text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)
 
     def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
+        import json
         _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_word, first_audio_file, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
         new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
+        words_list = _transcribed_text.split()
+        first_word = words_list[0]
+        json_res_loaded = json.loads(_res)
+        audio_durations = json_res_loaded["audio_durations"]
+        first_audio_duration = audio_durations[0]
         output = {
             text_transcribed_hidden: _transcribed_text,
             text_letter_correctness: _letter_correctness,
@@ -153,7 +165,9 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             text_raw_json_output_hidden: _res,
             num_tot_recognized_words: _num_tot_recognized_word,
             num_selected_recognized_word: new_num_selected_recognized_word,
-            audio_splitted_student_recording_stt: first_audio_file
+            audio_splitted_student_recording_stt: first_audio_file,
+            text_selected_recognized_word_hidden: first_word,
+            num_audio_duration_hidden: first_audio_duration
         }
         match lang:
             case "de":
@@ -185,7 +199,9 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             num_score_en,
             num_tot_recognized_words,
             num_selected_recognized_word,
-            audio_splitted_student_recording_stt
+            audio_splitted_student_recording_stt,
+            text_selected_recognized_word_hidden,
+            num_audio_duration_hidden
         ],
     )
 
@@ -234,7 +250,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
     )
     text_recording_ipa.change(
         None,
-        inputs=[get_textbox_empty_hidden(), get_textbox_empty_hidden(), get_number_empty_hidden()],
+        inputs=[get_textbox_hidden(), get_textbox_hidden(), get_number_hidden()],
         outputs=[html_output],
         js=js.js_update_ipa_output,
     )
@@ -264,7 +280,14 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
     num_selected_recognized_word.input(
         fn=lambdaSpeechToScore.get_selected_word,
         inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
-        outputs=[audio_splitted_student_recording_stt],
+        outputs=[audio_splitted_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
+    )
+    audio_splitted_student_recording_stt.play(
+        fn=None,
+        # text, language, sleepTime = null, prefix = null
+        inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
+        outputs=audio_splitted_student_recording_stt,
+        js=js.js_play_audio
     )
 
     @gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
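
Taken together, the new wiring is: editing the selected word index refreshes the split-audio player plus the two hidden components, and the player's play event then calls js_play_audio with (word, language, duration), so the browser TTS pronounces the word only after the recorded clip has finished. A minimal, self-contained sketch of the fn=None + js= pattern this relies on (component names and the JS body here are illustrative, not the app's own):

    import gradio as gr

    # A JS handler embedded as a Python string, the same way js.js_play_audio is.
    js_speak = """
    function speak(text, language) {
        const utter = new SpeechSynthesisUtterance(text);
        utter.lang = language;
        window.speechSynthesis.speak(utter);
    }
    """

    with gr.Blocks() as demo:
        word = gr.Textbox(value="hello", visible=False)
        language = gr.Textbox(value="en-US", visible=False)
        audio = gr.Audio(label="word clip")
        # fn=None skips the Python round-trip: Gradio passes the current
        # values of `inputs` straight to the JS function when playback starts.
        audio.play(fn=None, inputs=[word, language], outputs=None, js=js_speak)

    demo.launch()
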