alessandro trinca tornidor committed
Commit bd49a31 • 1 parent: dc92d10

feat: play word by word with the correct TTS pronunciation

Browse files:
- aip_trainer/lambdas/js.py +20 -7
- aip_trainer/lambdas/lambdaSpeechToScore.py +16 -16
- app.py +29 -6
aip_trainer/lambdas/js.py CHANGED

@@ -24,17 +24,21 @@ function updateCssText(text, letters, idxSelectedWord) {
 """
 
 js_play_audio = """
-function playAudio(text, language) {
+function playAudio(text, language, sleepTime = 0) {
     let voice_idx = 0;
     let voice_synth = null;
     let synth = window.speechSynthesis;
     let voice_lang;
+    let sleepTimeAdditional = 500; // for some reason using this with a default input argument give an object instead of the correct number
+
+    function sleep (time) {
+        return new Promise((resolve) => setTimeout(resolve, time));
+    }
 
     function setSpeech() {
         return new Promise(
             function (resolve, reject) {
                 let id;
-
                 id = setInterval(() => {
                     if (synth.getVoices().length !== 0) {
                         resolve(synth.getVoices());
@@ -53,9 +57,10 @@ function playAudio(text, language) {
             voice_lang = 'en-US';
             break;
         default:
-
-
-            alert(
+            const msg = `Error: language ${language} not valid!`
+            console.error(msg);
+            alert(msg)
+            throw new Error(msg)
     }
 
     let s = setSpeech();
@@ -66,7 +71,8 @@ function playAudio(text, language) {
         voicesSynth = voices.filter(voice => voice.lang.startsWith(language));
     }
     if (voicesSynth.length === 0) {
-        msg = `Error: no voice found for language ${voice_lang} / ${language}, you should use the Text-To-Speech backend feature...`
+        const msg = `Error: no voice found for language ${voice_lang} / ${language}, you should use the Text-To-Speech backend feature...`
+        console.error(msg);
         alert(msg)
         throw new Error(msg)
     }
@@ -74,7 +80,14 @@ function playAudio(text, language) {
     utterThis.voice = voicesSynth[0];
     utterThis.rate = 0.7;
 
-    synth.speak(utterThis);
+    if (sleepTime > 0) {
+        sleepTime *= 1000;
+        sleepTime += sleepTimeAdditional;
+    }
+    console.log("start js_play_audio:: sleepTime:", sleepTime, "#")
+    sleep(sleepTime).then(() => {
+        synth.speak(utterThis);
+    })
     // todo: capture audio from speech synthesis to reuse on the frontend
     // https://stackoverflow.com/questions/45003548/how-to-capture-generated-audio-from-window-speechsynthesis-speak-call
 });
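Note: the updated js_play_audio above delays synth.speak() by the duration of the selected word's audio clip (seconds converted to milliseconds, plus a fixed 500 ms buffer), so the browser TTS pronunciation starts only after the recorded clip has finished playing. A minimal Python sketch of that delay computation, for reference only (the actual logic is the JS shown above; compute_sleep_ms is an illustrative name):

    def compute_sleep_ms(sleep_time_s: float, additional_ms: int = 500) -> float:
        # mirrors: if (sleepTime > 0) { sleepTime *= 1000; sleepTime += sleepTimeAdditional; }
        if sleep_time_s > 0:
            return sleep_time_s * 1000 + additional_ms
        return 0.0

    assert compute_sleep_ms(0.7) == 1200.0  # a 0.7 s word clip delays TTS by 1.2 s
    assert compute_sleep_ms(0.0) == 0.0     # no duration given: speak immediately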
aip_trainer/lambdas/lambdaSpeechToScore.py CHANGED

@@ -164,8 +164,8 @@ def get_speech_to_score_tuple(real_text: str, file_bytes_or_audiotmpfile: str |
     num_words = len(end_time)
     app_logger.debug(f"start splitting recorded audio into {num_words} words...")
 
-    audio_files = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
-    output = {'audio_files': audio_files, **output}
+    audio_files, audio_durations = get_splitted_audio_file(audiotmpfile=file_bytes_or_audiotmpfile, start_time=start_time, end_time=end_time)
+    output = {'audio_files': audio_files, "audio_durations": audio_durations, **output}
     first_audio_file = audio_files[0]
     return real_transcripts, is_letter_correct_all_words, pronunciation_accuracy, ipa_transcript, real_transcripts_ipa, num_words, first_audio_file, json.dumps(output)
 
@@ -175,36 +175,36 @@ def soundfile_write(audiofile: str | Path, data: np.ndarray, samplerate: int):
     sf.write(audiofile, data, samplerate)
 
 
-def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> str:
+def get_selected_word(idx_recorded_word: int, raw_json_output: str) -> tuple[str]:
     recognition_output = json.loads(raw_json_output)
     list_audio_files = recognition_output["audio_files"]
     real_transcripts = recognition_output["real_transcripts"]
+    audio_durations = recognition_output["audio_durations"]
     real_transcripts_list = real_transcripts.split()
     app_logger.info(f"idx_recorded_word:{idx_recorded_word} ...")
     current_word = real_transcripts_list[idx_recorded_word]
-    app_logger.info(f"
-
+    app_logger.info(f"current word:{current_word} ...")
+    current_duration = audio_durations[idx_recorded_word]
+    app_logger.info(f"current_duration:{current_duration} ...")
+    return list_audio_files[idx_recorded_word], current_word, current_duration
 
 
-def
-    input_json = json.loads(text_raw_json_output_hidden)
-    start_time = input_json["start_time"]
-    end_time = input_json["end_time"]
-    return get_splitted_audio_file(audiotmpfile, start_time, end_time)
-
-
-def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float], signal: np.ndarray = None, samplerate: int = None) -> list[str]:
+def get_splitted_audio_file(audiotmpfile: str | Path, start_time: list[float], end_time: list[float], signal: np.ndarray = None, samplerate: int = None) -> tuple[list[str], list[float]]:
     import soundfile as sf
     audio_files = []
+    if signal is not None:
+        audiotmpfile = sf.SoundFile(signal, samplerate=samplerate)
+    audio_durations = []
     for n, (start_nth, end_nth) in enumerate(zip(start_time, end_time)):
-        if signal is not None:
-            audiotmpfile = sf.SoundFile(signal, samplerate=samplerate)
         signal_nth, samplerate = soundfile_load(audiotmpfile, offset=start_nth, duration=end_nth - start_nth)
         audiofile = get_file_with_custom_suffix(audiotmpfile, f"_part{n}_start{start_nth}_end{end_nth}")
         soundfile_write(audiofile=audiofile, data=signal_nth, samplerate=samplerate)
         app_logger.info(f"audio file {audiofile} written...")
         audio_files.append(str(audiofile))
-    return audio_files
+        duration = end_nth - start_nth
+        app_logger.info(f"audio file {audiofile} has duration {duration}...")
+        audio_durations.append(duration)
+    return audio_files, audio_durations
 
 
 def get_file_with_custom_suffix(basefile: str | Path, custom_suffix: str) -> list[str]:
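Note: get_splitted_audio_file and get_selected_word now return per-word durations alongside the audio paths, and the durations travel inside the serialized JSON output. A usage sketch under those assumptions (the payload below is hypothetical, shaped like the JSON built by get_speech_to_score_tuple above; file paths are illustrative):

    import json

    raw_json_output = json.dumps({
        "audio_files": ["/tmp/rec_part0_start0.0_end0.5.wav",
                        "/tmp/rec_part1_start0.5_end1.2.wav"],
        "audio_durations": [0.5, 0.7],
        "real_transcripts": "hello world",
    })

    # returns the clip path, the word text, and the clip duration for index 1
    audio_file, word, duration = get_selected_word(1, raw_json_output)
    assert (word, duration) == ("world", 0.7)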
app.py CHANGED

@@ -15,10 +15,14 @@ css = """
 word_idx_text = "Selected word index"
 
 
-def get_textbox_hidden():
+def get_textbox_hidden(text = None):
+    if text:
+        return gr.Number(value=text, visible=False)
     return gr.Textbox(visible=False)
 
-def get_number_hidden():
+def get_number_hidden(x: int = None):
+    if x:
+        return gr.Number(value=x, visible=False)
     return gr.Number(visible=False)
 
 def clear():
@@ -80,6 +84,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             elem_id="audio-student-recording-stt-id-element",
         )
     with gr.Row():
+        num_audio_duration_hidden = gr.Number(label="num_first_audio_duration", value=0, interactive=False, visible=False)
         with gr.Accordion("Click here to expand the table examples", open=False, elem_id="accordion-examples-id-element"):
             examples_text = gr.Examples(
                 examples=[
@@ -140,10 +145,17 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             show_download_button=True,
            elem_id="audio-splitted-student-recording-stt-id-element",
        )
+        text_selected_recognized_word_hidden = gr.Textbox(label="text_selected_recognized_word", value="placeholder", interactive=False, visible=False)
 
     def get_updated_score_by_language(text: str, audio_rec: str | Path, lang: str, score_de: float, score_en: float):
+        import json
         _transcribed_text, _letter_correctness, _pronunciation_accuracy, _recording_ipa, _ideal_ipa, _num_tot_recognized_word, first_audio_file, _res = lambdaSpeechToScore.get_speech_to_score_tuple(text, audio_rec, lang, remove_random_file=False)
         new_num_selected_recognized_word = gr.Number(label=word_idx_text, visible=True, value=0)
+        words_list = _transcribed_text.split()
+        first_word = words_list[0]
+        json_res_loaded = json.loads(_res)
+        audio_durations = json_res_loaded["audio_durations"]
+        first_audio_duration = audio_durations[0]
         output = {
             text_transcribed_hidden: _transcribed_text,
             text_letter_correctness: _letter_correctness,
@@ -153,7 +165,9 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             text_raw_json_output_hidden: _res,
             num_tot_recognized_words: _num_tot_recognized_word,
             num_selected_recognized_word: new_num_selected_recognized_word,
-            audio_splitted_student_recording_stt: first_audio_file
+            audio_splitted_student_recording_stt: first_audio_file,
+            text_selected_recognized_word_hidden: first_word,
+            num_audio_duration_hidden: first_audio_duration
         }
         match lang:
             case "de":
@@ -185,7 +199,9 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
             num_score_en,
             num_tot_recognized_words,
             num_selected_recognized_word,
-            audio_splitted_student_recording_stt
+            audio_splitted_student_recording_stt,
+            text_selected_recognized_word_hidden,
+            num_audio_duration_hidden
         ],
     )
 
@@ -234,7 +250,7 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
     )
     text_recording_ipa.change(
         None,
-        inputs=[
+        inputs=[get_textbox_hidden(), get_textbox_hidden(), get_number_hidden()],
         outputs=[html_output],
         js=js.js_update_ipa_output,
     )
@@ -264,7 +280,14 @@ with gr.Blocks(css=css, head=js.head_driver_tour) as gradio_app:
     num_selected_recognized_word.input(
         fn=lambdaSpeechToScore.get_selected_word,
         inputs=[num_selected_recognized_word, text_raw_json_output_hidden],
-        outputs=[audio_splitted_student_recording_stt],
+        outputs=[audio_splitted_student_recording_stt, text_selected_recognized_word_hidden, num_audio_duration_hidden],
+    )
+    audio_splitted_student_recording_stt.play(
+        fn=None,
+        # text, language, sleepTime = null, prefix = null
+        inputs=[text_selected_recognized_word_hidden, radio_language, num_audio_duration_hidden],
+        outputs=audio_splitted_student_recording_stt,
+        js=js.js_play_audio
     )
 
     @gradio_app.load(inputs=[local_storage], outputs=[num_score_de, num_score_en])
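Note: with this wiring, changing the selected word index refreshes three components at once (the split-audio player, the hidden word textbox, and the hidden duration), and pressing play on the clip fires js.js_play_audio with (text, language, sleepTime) taken positionally from the inputs list. A stripped-down sketch of the same pattern (hypothetical components; the inline JS stands in for js.js_play_audio):

    import gradio as gr

    # stand-in for js.js_play_audio: receives the inputs list positionally
    js_speak = "(text, language) => { console.log('TTS for:', text, language); }"

    with gr.Blocks() as demo:
        word_hidden = gr.Textbox(value="hello", visible=False)
        lang_hidden = gr.Textbox(value="en", visible=False)
        clip = gr.Audio(label="word clip")
        # when the user plays the clip, run the JS hook with the hidden values,
        # mirroring audio_splitted_student_recording_stt.play(...) above
        clip.play(fn=None, inputs=[word_hidden, lang_hidden], js=js_speak)

    demo.launch()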