First repository code commit
- AIModels.py +65 -0
- ModelInterfaces.py +65 -0
- README.md +24 -1
- RuleBasedModels.py +29 -0
- WordMatching.py +181 -0
- WordMetrics.py +55 -0
- data_de_en_2.pickle +3 -0
- lambdaGetSample.py +89 -0
- lambdaSpeechToScore.py +196 -0
- lambdaTTS.py +46 -0
- models.py +71 -0
- pronunciationTrainer.py +201 -0
- requirements.txt +18 -0
- static/.DS_Store +0 -0
- static/ASR_bad.wav +0 -0
- static/ASR_good.wav +0 -0
- static/ASR_okay.wav +0 -0
- static/css/.DS_Store +0 -0
- static/css/style-new.css +471 -0
- static/javascript/callbacks.js +584 -0
- templates/.DS_Store +0 -0
- templates/main.html +181 -0
- unitTests.py +99 -0
- utilsFileIO.py +9 -0
- webApp.py +47 -0
AIModels.py
ADDED
@@ -0,0 +1,65 @@
import ModelInterfaces
import torch
import numpy as np


class NeuralASR(ModelInterfaces.IASRModel):
    word_locations_in_samples = None
    audio_transcript = None

    def __init__(self, model: torch.nn.Module, decoder) -> None:
        super().__init__()
        self.model = model
        self.decoder = decoder  # Decoder from CTC-outputs to transcripts

    def getTranscript(self) -> str:
        """Get the transcript of the processed audio"""
        assert self.audio_transcript is not None, \
            'Cannot get the audio transcript without having processed the audio'
        return self.audio_transcript

    def getWordLocations(self) -> list:
        """Get the pairs of word locations from the audio"""
        assert self.word_locations_in_samples is not None, \
            'Cannot get word locations without having processed the audio'

        return self.word_locations_in_samples

    def processAudio(self, audio: torch.Tensor):
        """Process the audio"""
        audio_length_in_samples = audio.shape[1]
        with torch.inference_mode():
            nn_output = self.model(audio)

        self.audio_transcript, self.word_locations_in_samples = self.decoder(
            nn_output[0, :, :].detach(), audio_length_in_samples, word_align=True)


class NeuralTTS(ModelInterfaces.ITextToSpeechModel):
    def __init__(self, model: torch.nn.Module, sampling_rate: int) -> None:
        super().__init__()
        self.model = model
        self.sampling_rate = sampling_rate

    def getAudioFromSentence(self, sentence: str) -> np.array:
        with torch.inference_mode():
            audio_transcript = self.model.apply_tts(texts=[sentence],
                                                    sample_rate=self.sampling_rate)[0]

        return audio_transcript


class NeuralTranslator(ModelInterfaces.ITranslationModel):
    def __init__(self, model: torch.nn.Module, tokenizer) -> None:
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer

    def translateSentence(self, sentence: str) -> str:
        """Get the translation of the sentence"""
        tokenized_text = self.tokenizer(sentence, return_tensors='pt')
        translation = self.model.generate(**tokenized_text)
        translated_text = self.tokenizer.batch_decode(
            translation, skip_special_tokens=True)[0]

        return translated_text
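A minimal usage sketch (not part of the commit), pairing `NeuralTranslator` with the same Hugging Face model that `models.getTranslationModel()` loads further down in this commit; the printed output is only indicative:

```python
# Sketch: wiring NeuralTranslator to the Helsinki-NLP de->en model.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import AIModels

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-de-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-de-en")

translator = AIModels.NeuralTranslator(model, tokenizer)
print(translator.translateSentence("Guten Morgen"))  # expected: something like "Good morning"
```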
ModelInterfaces.py
ADDED
@@ -0,0 +1,65 @@
import abc
import numpy as np


class IASRModel(metaclass=abc.ABCMeta):
    @classmethod
    def __subclasshook__(cls, subclass):
        return (hasattr(subclass, 'getTranscript') and
                callable(subclass.getTranscript) and
                hasattr(subclass, 'getWordLocations') and
                callable(subclass.getWordLocations) and
                hasattr(subclass, 'processAudio') and
                callable(subclass.processAudio))

    @abc.abstractmethod
    def getTranscript(self) -> str:
        """Get the transcript of the processed audio"""
        raise NotImplementedError

    @abc.abstractmethod
    def getWordLocations(self) -> list:
        """Get the pairs of word locations from the audio"""
        raise NotImplementedError

    @abc.abstractmethod
    def processAudio(self, audio):
        """Process the audio"""
        raise NotImplementedError


class ITranslationModel(metaclass=abc.ABCMeta):
    @classmethod
    def __subclasshook__(cls, subclass):
        return (hasattr(subclass, 'translateSentence') and
                callable(subclass.translateSentence))

    @abc.abstractmethod
    def translateSentence(self, sentence: str) -> str:
        """Get the translation of the sentence"""
        raise NotImplementedError


class ITextToSpeechModel(metaclass=abc.ABCMeta):
    @classmethod
    def __subclasshook__(cls, subclass):
        return (hasattr(subclass, 'getAudioFromSentence') and
                callable(subclass.getAudioFromSentence))

    @abc.abstractmethod
    def getAudioFromSentence(self, sentence: str) -> np.array:
        """Get audio from sentence"""
        raise NotImplementedError


class ITextToPhonemModel(metaclass=abc.ABCMeta):
    @classmethod
    def __subclasshook__(cls, subclass):
        return (hasattr(subclass, 'convertToPhonem') and
                callable(subclass.convertToPhonem))

    @abc.abstractmethod
    def convertToPhonem(self, sentence: str) -> str:
        """Convert sentence to phonemes"""
        raise NotImplementedError
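A short sketch (not part of the commit) of what the `__subclasshook__` definitions buy: any class that provides the required callables counts as a virtual subclass of the interface, without inheriting from it:

```python
# Sketch: duck-typed implementations pass issubclass/isinstance checks against IASRModel.
import ModelInterfaces


class DummyASR:
    def processAudio(self, audio):
        self.transcript = "hello world"

    def getTranscript(self) -> str:
        return self.transcript

    def getWordLocations(self) -> list:
        return []


print(issubclass(DummyASR, ModelInterfaces.IASRModel))  # True
```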
README.md
CHANGED
@@ -1 +1,24 @@
## Motivation
This tool uses AI to evaluate your pronunciation so you can be understood more clearly. To try it directly, go to https://aipronunciationtr.com (please use the Chrome browser on desktop).

Often, when we want to improve our pronunciation, it is very difficult to self-assess how well we're speaking. Asking a native speaker or a language instructor to constantly correct us is either impractical, due to monetary constraints, or annoying, simply because it gets too boring for the other person. Additionally, they may often say "it sounds okay" after your 10th try just to not discourage you, even though you may still have some mistakes in your pronunciation.

The AI pronunciation trainer provides objective feedback on your pronunciation in an automatic and scalable fashion, so the only limit to your improvement is your own dedication.

This project originated from a small program that I wrote to improve my own pronunciation. When I finished it, I believed it could also be useful to other people trying to be better understood, so I decided to make a simple, more user-friendly version of it.

## Installation
To run the program, all you need to do is install the requirements and run the main Python file:
```
pip install -r requirements.txt
python webApp.py
```
The code is pure Python, so you should be able to run it without any major issues as long as you're using a recent Python 3.X version.

## Online version
For people who don't feel comfortable running code, or who just want a quick way to use the tool, I host an online version at https://aipronunciationtr.com. It should work without any major issues in desktop Chrome; other browsers are not officially supported, although most of the functionality should work fine.

Please be aware that usage is limited per day (I'm still not rich ;) ). If, for some reason, you would like to skip the daily usage limit, just get in contact and we can negotiate an additional API key only for you.

## Disclaimer
Even though the tool can be useful, my intention was not to make an industry-grade program with 100% test coverage, no errors, and full support, but rather an easy-to-use tool that may be helpful to some people even if it is not perfect. Thus, be aware that some small bugs may be present. In case you find something not working, all feedback is welcome and issues may be addressed depending on their severity.
RuleBasedModels.py
ADDED
@@ -0,0 +1,29 @@
import ModelInterfaces
import torch
import numpy as np
import epitran
import eng_to_ipa


class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
    word_locations_in_samples = None
    audio_transcript = None

    def __init__(self, epitran_model) -> None:
        super().__init__()
        self.epitran_model = epitran_model

    def convertToPhonem(self, sentence: str) -> str:
        phonem_representation = self.epitran_model.transliterate(sentence)
        return phonem_representation


class EngPhonemConverter(ModelInterfaces.ITextToPhonemModel):

    def __init__(self,) -> None:
        super().__init__()

    def convertToPhonem(self, sentence: str) -> str:
        phonem_representation = eng_to_ipa.convert(sentence)
        phonem_representation = phonem_representation.replace('*', '')
        return phonem_representation
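A small usage sketch (not part of the commit) showing both rule-based converters, exactly as they are instantiated in `lambdaGetSample.py` and `pronunciationTrainer.py`:

```python
# Sketch: converting text to IPA with the two rule-based converters.
import epitran
import RuleBasedModels

de_converter = RuleBasedModels.EpitranPhonemConverter(epitran.Epitran('deu-Latn'))
en_converter = RuleBasedModels.EngPhonemConverter()

print(de_converter.convertToPhonem('Hallo, wie geht es dir?'))
print(en_converter.convertToPhonem('Hello, how are you?'))
```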
WordMatching.py
ADDED
@@ -0,0 +1,181 @@
import WordMetrics
from ortools.sat.python import cp_model
import numpy as np
from string import punctuation
from dtwalign import dtw_from_distance_matrix
import time

offset_blank = 1
TIME_THRESHOLD_MAPPING = 5.0


def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.array:
    number_of_real_words = len(words_real)
    number_of_estimated_words = len(words_estimated)

    word_distance_matrix = np.zeros(
        (number_of_estimated_words+offset_blank, number_of_real_words))
    for idx_estimated in range(number_of_estimated_words):
        for idx_real in range(number_of_real_words):
            word_distance_matrix[idx_estimated, idx_real] = WordMetrics.edit_distance_python(
                words_estimated[idx_estimated], words_real[idx_real])

    if offset_blank == 1:
        for idx_real in range(number_of_real_words):
            word_distance_matrix[number_of_estimated_words,
                                 idx_real] = len(words_real[idx_real])
    return word_distance_matrix


def get_best_path_from_distance_matrix(word_distance_matrix):
    modelCpp = cp_model.CpModel()

    number_of_real_words = word_distance_matrix.shape[1]
    number_of_estimated_words = word_distance_matrix.shape[0]-1

    number_words = np.maximum(number_of_real_words, number_of_estimated_words)

    estimated_words_order = [modelCpp.NewIntVar(0, int(
        number_words - 1 + offset_blank), 'w%i' % i) for i in range(number_words+offset_blank)]

    # They are in ascending order
    for word_idx in range(number_words-1):
        modelCpp.Add(
            estimated_words_order[word_idx+1] >= estimated_words_order[word_idx])

    total_phoneme_distance = 0
    real_word_at_time = {}
    for idx_estimated in range(number_of_estimated_words):
        for idx_real in range(number_of_real_words):
            real_word_at_time[idx_estimated, idx_real] = modelCpp.NewBoolVar(
                'real_word_at_time'+str(idx_real)+'-'+str(idx_estimated))
            modelCpp.Add(estimated_words_order[idx_estimated] == idx_real).OnlyEnforceIf(
                real_word_at_time[idx_estimated, idx_real])
            total_phoneme_distance += word_distance_matrix[idx_estimated,
                                                           idx_real]*real_word_at_time[idx_estimated, idx_real]

    # If no word in time, difference is calculated from empty string
    for idx_real in range(number_of_real_words):
        word_has_a_match = modelCpp.NewBoolVar(
            'word_has_a_match'+str(idx_real))
        modelCpp.Add(sum([real_word_at_time[idx_estimated, idx_real] for idx_estimated in range(
            number_of_estimated_words)]) == 1).OnlyEnforceIf(word_has_a_match)
        total_phoneme_distance += word_distance_matrix[number_of_estimated_words,
                                                       idx_real]*word_has_a_match.Not()

    # Loss should be minimized
    modelCpp.Minimize(total_phoneme_distance)

    solver = cp_model.CpSolver()
    solver.parameters.max_time_in_seconds = TIME_THRESHOLD_MAPPING
    status = solver.Solve(modelCpp)

    mapped_indices = []
    try:
        for word_idx in range(number_words):
            mapped_indices.append(
                (solver.Value(estimated_words_order[word_idx])))

        return np.array(mapped_indices, dtype=int)
    except:
        return []


def get_resulting_string(mapped_indices: np.array, words_estimated: list, words_real: list) -> list:
    mapped_words = []
    mapped_words_indices = []
    WORD_NOT_FOUND_TOKEN = '-'
    number_of_real_words = len(words_real)
    for word_idx in range(number_of_real_words):
        position_of_real_word_indices = np.where(
            mapped_indices == word_idx)[0].astype(int)

        if len(position_of_real_word_indices) == 0:
            mapped_words.append(WORD_NOT_FOUND_TOKEN)
            mapped_words_indices.append(-1)
            continue

        if len(position_of_real_word_indices) == 1:
            mapped_words.append(
                words_estimated[position_of_real_word_indices[0]])
            mapped_words_indices.append(position_of_real_word_indices[0])
            continue

        # Check which index gives the lowest error
        if len(position_of_real_word_indices) > 1:
            error = 99999
            best_possible_combination = ''
            best_possible_idx = -1
            for single_word_idx in position_of_real_word_indices:
                idx_above_word = single_word_idx >= len(words_estimated)
                if idx_above_word:
                    continue
                error_word = WordMetrics.edit_distance_python(
                    words_estimated[single_word_idx], words_real[word_idx])
                if error_word < error:
                    error = error_word*1
                    best_possible_combination = words_estimated[single_word_idx]
                    best_possible_idx = single_word_idx

            mapped_words.append(best_possible_combination)
            mapped_words_indices.append(best_possible_idx)
            continue

    return mapped_words, mapped_words_indices


def get_best_mapped_words(words_estimated: list, words_real: list) -> list:

    word_distance_matrix = get_word_distance_matrix(
        words_estimated, words_real)

    start = time.time()
    mapped_indices = get_best_path_from_distance_matrix(word_distance_matrix)

    duration_of_mapping = time.time()-start
    # In case or-tools doesn't converge, go to a faster, low-quality solution
    if len(mapped_indices) == 0 or duration_of_mapping > TIME_THRESHOLD_MAPPING+0.5:
        mapped_indices = (dtw_from_distance_matrix(
            word_distance_matrix)).path[:len(words_estimated), 1]

    mapped_words, mapped_words_indices = get_resulting_string(
        mapped_indices, words_estimated, words_real)

    return mapped_words, mapped_words_indices


# Faster, but not optimal
def get_best_mapped_words_dtw(words_estimated: list, words_real: list) -> list:

    from dtwalign import dtw_from_distance_matrix
    word_distance_matrix = get_word_distance_matrix(
        words_estimated, words_real)
    mapped_indices = dtw_from_distance_matrix(
        word_distance_matrix).path[:-1, 0]

    mapped_words, mapped_words_indices = get_resulting_string(
        mapped_indices, words_estimated, words_real)
    return mapped_words, mapped_words_indices


def getWhichLettersWereTranscribedCorrectly(real_word, transcribed_word):
    is_letter_correct = [None]*len(real_word)
    for idx, letter in enumerate(real_word):
        if letter == transcribed_word[idx] or letter in punctuation:
            is_letter_correct[idx] = 1
        else:
            is_letter_correct[idx] = 0
    return is_letter_correct


def parseLetterErrorsToHTML(word_real, is_letter_correct):
    word_colored = ''
    correct_color_start = '*'
    correct_color_end = '*'
    wrong_color_start = '-'
    wrong_color_end = '-'
    for idx, letter in enumerate(word_real):
        if is_letter_correct[idx] == 1:
            word_colored += correct_color_start + letter + correct_color_end
        else:
            word_colored += wrong_color_start + letter + wrong_color_end
    return word_colored
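A minimal sketch (not part of the commit) of the word alignment: the estimated (ASR) words are mapped onto the expected sentence in order, and target words with no match come back as the `-` placeholder. The printed values are only indicative, since the CP-SAT solution can vary:

```python
# Sketch: aligning an ASR transcript against the expected sentence.
import WordMatching as wm

words_real = 'the quick brown fox'.split()
words_estimated = 'the quik fox'.split()

mapped_words, mapped_indices = wm.get_best_mapped_words(words_estimated, words_real)
print(mapped_words)    # e.g. ['the', 'quik', '-', 'fox']
print(mapped_indices)
```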
WordMetrics.py
ADDED
@@ -0,0 +1,55 @@
import numpy as np

# ref from https://gitlab.com/-/snippets/1948157
# For some variants, look here https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python

# Pure python
def edit_distance_python2(a, b):
    # This version is commutative, so as an optimization we force |a|>=|b|
    if len(a) < len(b):
        return edit_distance_python2(b, a)
    if len(b) == 0:  # Can deal with empty sequences faster
        return len(a)
    # Only two rows are really needed: the one currently filled in, and the previous
    distances = []
    distances.append([i for i in range(len(b)+1)])
    distances.append([0 for _ in range(len(b)+1)])
    # We can prefill the first row:
    costs = [0 for _ in range(3)]
    for i, a_token in enumerate(a, start=1):
        distances[1][0] += 1  # Deals with the first column.
        for j, b_token in enumerate(b, start=1):
            costs[0] = distances[1][j-1] + 1
            costs[1] = distances[0][j] + 1
            costs[2] = distances[0][j-1] + (0 if a_token == b_token else 1)
            distances[1][j] = min(costs)
        # Move to the next row:
        distances[0][:] = distances[1][:]
    return distances[1][len(b)]


# https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
def edit_distance_python(seq1, seq2):
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros((size_x, size_y))
    for x in range(size_x):
        matrix[x, 0] = x
    for y in range(size_y):
        matrix[0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            if seq1[x-1] == seq2[y-1]:
                matrix[x, y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1],
                    matrix[x, y-1] + 1
                )
            else:
                matrix[x, y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1] + 1,
                    matrix[x, y-1] + 1
                )
    # print(matrix)
    return matrix[size_x - 1, size_y - 1]
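A quick sketch (not part of the commit): the word matcher above and the trainer below both score string pairs with `edit_distance_python`; the classic example has a Levenshtein distance of 3:

```python
# Sketch: Levenshtein distance between 'kitten' and 'sitting' is 3.
import WordMetrics

print(WordMetrics.edit_distance_python('kitten', 'sitting'))   # 3.0 (numpy matrix entry)
print(WordMetrics.edit_distance_python2('kitten', 'sitting'))  # 3
```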
data_de_en_2.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:30ec872918777a5c766ab26d78f33383f6b36fccb00af7fc5c543bd43c98ffa4
size 1056086
lambdaGetSample.py
ADDED
@@ -0,0 +1,89 @@
import pandas as pd
import json
import RuleBasedModels
import epitran
import random
import pickle


class TextDataset():
    def __init__(self, table, language='-'):
        self.table_dataframe = table
        self.number_of_samples = len(table)
        self.language = language

    def __getitem__(self, idx):

        if self.language == 'de':
            line = [self.table_dataframe['de_sentence'].iloc[idx]]
        elif self.language == 'en':
            line = [self.table_dataframe['en_sentence'].iloc[idx]]
        else:
            line = [self.table_dataframe['sentence'].iloc[idx]]
        return line

    def __len__(self):
        return self.number_of_samples


sample_folder = "./"
lambda_database = {}
lambda_ipa_converter = {}

with open(sample_folder+'data_de_en_2.pickle', 'rb') as handle:
    df = pickle.load(handle)

lambda_database['de'] = TextDataset(df, 'de')
lambda_database['en'] = TextDataset(df, 'en')
lambda_translate_new_sample = False
lambda_ipa_converter['de'] = RuleBasedModels.EpitranPhonemConverter(
    epitran.Epitran('deu-Latn'))
lambda_ipa_converter['en'] = RuleBasedModels.EngPhonemConverter()


def lambda_handler(event, context):

    body = json.loads(event['body'])

    category = int(body['category'])

    language = body['language']

    sample_in_category = False

    while not sample_in_category:
        valid_sequence = False
        while not valid_sequence:
            try:
                sample_idx = random.randint(
                    0, len(lambda_database[language]) - 1)
                current_transcript = lambda_database[language][sample_idx]
                valid_sequence = True
            except:
                pass

        sentence_category = getSentenceCategory(
            current_transcript[0])

        sample_in_category = (sentence_category ==
                              category) or category == 0

    translated_transcript = ""

    current_ipa = lambda_ipa_converter[language].convertToPhonem(
        current_transcript[0])

    result = {'real_transcript': current_transcript,
              'ipa_transcript': current_ipa,
              'transcript_translation': translated_transcript}

    return json.dumps(result)


def getSentenceCategory(sentence) -> int:
    number_of_words = len(sentence.split())
    categories_word_limits = [0, 8, 20, 100000]
    for category in range(len(categories_word_limits)-1):
        if number_of_words > categories_word_limits[category] and number_of_words <= categories_word_limits[category+1]:
            return category+1
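A usage sketch (not part of the commit) for exercising the handler locally with an API-Gateway-style event. Importing the module loads `data_de_en_2.pickle` from the working directory, so this assumes you run it from the repo root. Category 0 accepts any sentence length; categories 1-3 select increasingly long sentences:

```python
# Sketch: requesting a random English sentence sample from the handler.
import json
import lambdaGetSample

event = {'body': json.dumps({'category': 1, 'language': 'en'})}
response = json.loads(lambdaGetSample.lambda_handler(event, None))
print(response['real_transcript'])
print(response['ipa_transcript'])
```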
lambdaSpeechToScore.py
ADDED
@@ -0,0 +1,196 @@
import torch
import json
import os
import WordMatching as wm
import utilsFileIO
import pronunciationTrainer
import base64
import time
import audioread
import numpy as np
from torchaudio.transforms import Resample


trainer_SST_lambda = {}
trainer_SST_lambda['de'] = pronunciationTrainer.getTrainer("de")
trainer_SST_lambda['en'] = pronunciationTrainer.getTrainer("en")

transform = Resample(orig_freq=48000, new_freq=16000)


def lambda_handler(event, context):

    data = json.loads(event['body'])

    real_text = data['title']
    file_bytes = base64.b64decode(
        data['base64Audio'][22:].encode('utf-8'))
    language = data['language']

    if len(real_text) == 0:
        return {
            'statusCode': 200,
            'headers': {
                'Access-Control-Allow-Headers': '*',
                'Access-Control-Allow-Credentials': "true",
                'Access-Control-Allow-Origin': '*',
                'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
            },
            'body': ''
        }

    start = time.time()
    random_file_name = './'+utilsFileIO.generateRandomString()+'.ogg'
    f = open(random_file_name, 'wb')
    f.write(file_bytes)
    f.close()
    print('Time for saving binary in file: ', str(time.time()-start))

    start = time.time()
    signal, fs = audioread_load(random_file_name)

    signal = transform(torch.Tensor(signal)).unsqueeze(0)

    print('Time for loading .ogg file: ', str(time.time()-start))

    result = trainer_SST_lambda[language].processAudioForGivenText(
        signal, real_text)

    start = time.time()
    os.remove(random_file_name)
    print('Time for deleting file: ', str(time.time()-start))

    start = time.time()
    real_transcripts_ipa = ' '.join(
        [word[0] for word in result['real_and_transcribed_words_ipa']])
    matched_transcripts_ipa = ' '.join(
        [word[1] for word in result['real_and_transcribed_words_ipa']])

    real_transcripts = ' '.join(
        [word[0] for word in result['real_and_transcribed_words']])
    matched_transcripts = ' '.join(
        [word[1] for word in result['real_and_transcribed_words']])

    words_real = real_transcripts.lower().split()
    mapped_words = matched_transcripts.split()

    is_letter_correct_all_words = ''
    for idx, word_real in enumerate(words_real):

        mapped_letters, mapped_letters_indices = wm.get_best_mapped_words(
            mapped_words[idx], word_real)

        is_letter_correct = wm.getWhichLettersWereTranscribedCorrectly(
            word_real, mapped_letters)  # , mapped_letters_indices)

        is_letter_correct_all_words += ''.join([str(is_correct)
                                                for is_correct in is_letter_correct]) + ' '

    pair_accuracy_category = ' '.join(
        [str(category) for category in result['pronunciation_categories']])
    print('Time to post-process results: ', str(time.time()-start))

    res = {'real_transcript': result['recording_transcript'],
           'ipa_transcript': result['recording_ipa'],
           'pronunciation_accuracy': str(int(result['pronunciation_accuracy'])),
           'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
           'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
           'pair_accuracy_category': pair_accuracy_category,
           'start_time': result['start_time'],
           'end_time': result['end_time'],
           'is_letter_correct_all_words': is_letter_correct_all_words}

    return json.dumps(res)

# From Librosa


def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
    """Load an audio buffer using audioread.

    This loads one block at a time, and then concatenates the results.
    """

    y = []
    with audioread.audio_open(path) as input_file:
        sr_native = input_file.samplerate
        n_channels = input_file.channels

        s_start = int(np.round(sr_native * offset)) * n_channels

        if duration is None:
            s_end = np.inf
        else:
            s_end = s_start + \
                (int(np.round(sr_native * duration)) * n_channels)

        n = 0

        for frame in input_file:
            frame = buf_to_float(frame, dtype=dtype)
            n_prev = n
            n = n + len(frame)

            if n < s_start:
                # offset is after the current frame
                # keep reading
                continue

            if s_end < n_prev:
                # we're off the end. stop reading
                break

            if s_end < n:
                # the end is in this frame. crop.
                frame = frame[: s_end - n_prev]

            if n_prev <= s_start <= n:
                # beginning is in this frame
                frame = frame[(s_start - n_prev):]

            # tack on the current frame
            y.append(frame)

    if y:
        y = np.concatenate(y)
        if n_channels > 1:
            y = y.reshape((-1, n_channels)).T
    else:
        y = np.empty(0, dtype=dtype)

    return y, sr_native

# From Librosa


def buf_to_float(x, n_bytes=2, dtype=np.float32):
    """Convert an integer buffer to floating point values.
    This is primarily useful when loading integer-valued wav data
    into numpy arrays.

    Parameters
    ----------
    x : np.ndarray [dtype=int]
        The integer-valued data buffer

    n_bytes : int [1, 2, 4]
        The number of bytes per sample in ``x``

    dtype : numeric type
        The target output type (default: 32-bit float)

    Returns
    -------
    x_float : np.ndarray [dtype=float]
        The input data buffer cast to floating point
    """

    # Invert the scale of the data
    scale = 1.0 / float(1 << ((8 * n_bytes) - 1))

    # Construct the format string
    fmt = "<i{:d}".format(n_bytes)

    # Rescale and format the data buffer
    return scale * np.frombuffer(x, fmt).astype(dtype)
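A sketch (not part of the commit) of how a client payload for this handler looks. The file name `my_recording.ogg` and the sentence are hypothetical placeholders for a 48 kHz recording of the target text; the handler strips the first 22 characters of `base64Audio`, i.e. a `data:audio/ogg;base64,` style data-URL prefix, before decoding:

```python
# Sketch: scoring a local recording against a target sentence via the handler.
import base64
import json
import lambdaSpeechToScore

with open('my_recording.ogg', 'rb') as f:  # hypothetical recording
    audio_b64 = base64.b64encode(f.read()).decode('utf-8')

event = {'body': json.dumps({
    'title': 'Hello world, how are you?',          # hypothetical target sentence
    'base64Audio': 'data:audio/ogg;base64,' + audio_b64,
    'language': 'en',
})}

scores = json.loads(lambdaSpeechToScore.lambda_handler(event, None))
print(scores['pronunciation_accuracy'])
print(scores['is_letter_correct_all_words'])
```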
lambdaTTS.py
ADDED
@@ -0,0 +1,46 @@
import models
import soundfile as sf
import json
import AIModels
#from flask import Response
import utilsFileIO
import os
import base64

sampling_rate = 16000
model_TTS_lambda = AIModels.NeuralTTS(models.getTTSModel('de'), sampling_rate)


def lambda_handler(event, context):

    body = json.loads(event['body'])

    text_string = body['value']

    linear_factor = 0.2
    audio = model_TTS_lambda.getAudioFromSentence(
        text_string).detach().numpy()*linear_factor
    random_file_name = utilsFileIO.generateRandomString(20)+'.wav'

    sf.write('./'+random_file_name, audio, 16000)

    with open(random_file_name, "rb") as f:
        audio_byte_array = f.read()

    os.remove(random_file_name)

    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Headers': '*',
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
        },
        'body': json.dumps(
            {
                "wavBase64": str(base64.b64encode(audio_byte_array))[2:-1],
            },
        )
    }
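A sketch (not part of the commit) of decoding the handler's base64 payload back into a playable wav. Note the module builds the German TTS model at import time, so the first import downloads it; the output path is a hypothetical example:

```python
# Sketch: synthesizing a sentence and writing the returned wav bytes to disk.
import base64
import json
import lambdaTTS

response = lambdaTTS.lambda_handler({'body': json.dumps({'value': 'Guten Tag'})}, None)
wav_bytes = base64.b64decode(json.loads(response['body'])['wavBase64'])

with open('tts_output.wav', 'wb') as f:  # hypothetical output path
    f.write(wav_bytes)
```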
models.py
ADDED
@@ -0,0 +1,71 @@
import torch
import torch.nn as nn

import pickle


def getASRModel(language: str) -> nn.Module:

    if language == 'de':
        model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                               model='silero_stt',
                                               language='de',
                                               device=torch.device('cpu'))
    elif language == 'en':
        model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                               model='silero_stt',
                                               language='en',
                                               device=torch.device('cpu'))
    elif language == 'fr':
        model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                               model='silero_stt',
                                               language='fr',
                                               device=torch.device('cpu'))
    else:
        raise ValueError('Language not implemented')

    return (model, decoder)


def getTTSModel(language: str) -> nn.Module:

    if language == 'de':
        speaker = 'thorsten_v2'  # 16 kHz
        model, _ = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                  model='silero_tts',
                                  language=language,
                                  speaker=speaker)
    elif language == 'en':
        speaker = 'lj_16khz'  # 16 kHz
        model = torch.hub.load(repo_or_dir='snakers4/silero-models',
                               model='silero_tts',
                               language=language,
                               speaker=speaker)
    else:
        raise ValueError('Language not implemented')

    return model


def getTranslationModel(language: str) -> nn.Module:
    from transformers import AutoTokenizer
    from transformers import AutoModelForSeq2SeqLM
    if language == 'de':
        model = AutoModelForSeq2SeqLM.from_pretrained(
            "Helsinki-NLP/opus-mt-de-en")
        tokenizer = AutoTokenizer.from_pretrained(
            "Helsinki-NLP/opus-mt-de-en")
        # Cache models to avoid Hugging Face processing
        with open('translation_model_de.pickle', 'wb') as handle:
            pickle.dump(model, handle)
        with open('translation_tokenizer_de.pickle', 'wb') as handle:
            pickle.dump(tokenizer, handle)
    else:
        raise ValueError('Language not implemented')

    return model, tokenizer
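A minimal sketch (not part of the commit): `getASRModel()` returns the `(model, decoder)` pair that `pronunciationTrainer.getTrainer()` wraps in `AIModels.NeuralASR`, with the decoder turning CTC outputs into a transcript plus per-word sample locations:

```python
# Sketch: constructing the ASR wrapper by hand, as getTrainer() does internally.
import models
import AIModels

model, decoder = models.getASRModel('en')
model.eval()
asr_model = AIModels.NeuralASR(model, decoder)
# asr_model.processAudio(...) then expects a 16 kHz mono waveform shaped (1, n_samples).
```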
pronunciationTrainer.py
ADDED
@@ -0,0 +1,201 @@
import torch
import numpy as np
import models as mo
import WordMetrics
import WordMatching as wm
import epitran
import ModelInterfaces as mi
import AIModels
import RuleBasedModels
from string import punctuation
import time


def getTrainer(language: str):

    device = torch.device('cpu')

    model, decoder = mo.getASRModel(language)
    model = model.to(device)
    model.eval()
    asr_model = AIModels.NeuralASR(model, decoder)

    if language == 'de':
        phonem_converter = RuleBasedModels.EpitranPhonemConverter(
            epitran.Epitran('deu-Latn'))
    elif language == 'en':
        phonem_converter = RuleBasedModels.EngPhonemConverter()
    else:
        raise ValueError('Language not implemented')

    trainer = PronunciationTrainer(
        asr_model, phonem_converter)

    return trainer


class PronunciationTrainer:
    current_transcript: str
    current_ipa: str

    current_recorded_audio: torch.Tensor
    current_recorded_transcript: str
    current_recorded_word_locations: list
    current_recorded_intonations: torch.tensor
    current_words_pronunciation_accuracy = []
    categories_thresholds = np.array([80, 60, 59])

    sampling_rate = 16000

    def __init__(self, asr_model: mi.IASRModel, word_to_ipa_converter: mi.ITextToPhonemModel) -> None:
        self.asr_model = asr_model
        self.ipa_converter = word_to_ipa_converter

    def getTranscriptAndWordsLocations(self, audio_length_in_samples: int):

        audio_transcript = self.asr_model.getTranscript()
        word_locations_in_samples = self.asr_model.getWordLocations()

        fade_duration_in_samples = 0.05*self.sampling_rate
        word_locations_in_samples = [(int(np.maximum(0, word['start_ts']-fade_duration_in_samples)), int(np.minimum(
            audio_length_in_samples-1, word['end_ts']+fade_duration_in_samples))) for word in word_locations_in_samples]

        return audio_transcript, word_locations_in_samples

    def getWordsRelativeIntonation(self, Audio: torch.tensor, word_locations: list):
        intonations = torch.zeros((len(word_locations), 1))
        intonation_fade_samples = 0.3*self.sampling_rate
        print(intonations.shape)
        for word in range(len(word_locations)):
            intonation_start = int(np.maximum(
                0, word_locations[word][0]-intonation_fade_samples))
            intonation_end = int(np.minimum(
                Audio.shape[1]-1, word_locations[word][1]+intonation_fade_samples))
            intonations[word] = torch.sqrt(torch.mean(
                Audio[0][intonation_start:intonation_end]**2))

        intonations = intonations/torch.mean(intonations)
        return intonations

    ##################### ASR Functions ###########################

    def processAudioForGivenText(self, recordedAudio: torch.Tensor = None, real_text=None):

        start = time.time()
        recording_transcript, recording_ipa, word_locations = self.getAudioTranscript(
            recordedAudio)
        print('Time for NN to transcribe audio: ', str(time.time()-start))

        start = time.time()
        real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices = self.matchSampleAndRecordedWords(
            real_text, recording_transcript)
        print('Time for matching transcripts: ', str(time.time()-start))

        start_time, end_time = self.getWordLocationsFromRecordInSeconds(
            word_locations, mapped_words_indices)

        pronunciation_accuracy, current_words_pronunciation_accuracy = self.getPronunciationAccuracy(
            real_and_transcribed_words)  # _ipa

        pronunciation_categories = self.getWordsPronunciationCategory(
            current_words_pronunciation_accuracy)

        result = {'recording_transcript': recording_transcript,
                  'real_and_transcribed_words': real_and_transcribed_words,
                  'recording_ipa': recording_ipa, 'start_time': start_time, 'end_time': end_time,
                  'real_and_transcribed_words_ipa': real_and_transcribed_words_ipa, 'pronunciation_accuracy': pronunciation_accuracy,
                  'pronunciation_categories': pronunciation_categories}

        return result

    def getAudioTranscript(self, recordedAudio: torch.Tensor = None):
        current_recorded_audio = recordedAudio

        current_recorded_audio = self.preprocessAudio(
            current_recorded_audio)

        self.asr_model.processAudio(current_recorded_audio)

        current_recorded_transcript, current_recorded_word_locations = self.getTranscriptAndWordsLocations(
            current_recorded_audio.shape[1])
        current_recorded_ipa = self.ipa_converter.convertToPhonem(
            current_recorded_transcript)

        return current_recorded_transcript, current_recorded_ipa, current_recorded_word_locations

    def getWordLocationsFromRecordInSeconds(self, word_locations, mapped_words_indices) -> list:
        start_time = []
        end_time = []
        for word_idx in range(len(mapped_words_indices)):
            start_time.append(float(word_locations[mapped_words_indices[word_idx]]
                                    [0])/self.sampling_rate)
            end_time.append(float(word_locations[mapped_words_indices[word_idx]]
                                  [1])/self.sampling_rate)
        return ' '.join([str(time) for time in start_time]), ' '.join([str(time) for time in end_time])

    ##################### END ASR Functions ###########################

    ##################### Evaluation Functions ###########################
    def matchSampleAndRecordedWords(self, real_text, recorded_transcript):
        words_estimated = recorded_transcript.split()

        if real_text is None:
            words_real = self.current_transcript[0].split()
        else:
            words_real = real_text.split()

        mapped_words, mapped_words_indices = wm.get_best_mapped_words(
            words_estimated, words_real)

        real_and_transcribed_words = []
        real_and_transcribed_words_ipa = []
        for word_idx in range(len(words_real)):
            if word_idx >= len(mapped_words)-1:
                mapped_words.append('-')
            real_and_transcribed_words.append(
                (words_real[word_idx], mapped_words[word_idx]))
            real_and_transcribed_words_ipa.append((self.ipa_converter.convertToPhonem(words_real[word_idx]),
                                                   self.ipa_converter.convertToPhonem(mapped_words[word_idx])))
        return real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices

    def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> float:
        total_mismatches = 0.
        number_of_phonemes = 0.
        current_words_pronunciation_accuracy = []
        for pair in real_and_transcribed_words_ipa:

            real_without_punctuation = self.removePunctuation(pair[0]).lower()
            number_of_word_mismatches = WordMetrics.edit_distance_python(
                real_without_punctuation, self.removePunctuation(pair[1]).lower())
            total_mismatches += number_of_word_mismatches
            number_of_phonemes_in_word = len(real_without_punctuation)
            number_of_phonemes += number_of_phonemes_in_word

            current_words_pronunciation_accuracy.append(float(
                number_of_phonemes_in_word-number_of_word_mismatches)/number_of_phonemes_in_word*100)

        percentage_of_correct_pronunciations = (
            number_of_phonemes-total_mismatches)/number_of_phonemes*100

        return np.round(percentage_of_correct_pronunciations), current_words_pronunciation_accuracy

    def removePunctuation(self, word: str) -> str:
        return ''.join([char for char in word if char not in punctuation])

    def getWordsPronunciationCategory(self, accuracies) -> list:
        categories = []

        for accuracy in accuracies:
            categories.append(
                self.getPronunciationCategoryFromAccuracy(accuracy))

        return categories

    def getPronunciationCategoryFromAccuracy(self, accuracy) -> int:
        return np.argmin(abs(self.categories_thresholds-accuracy))

    def preprocessAudio(self, audio: torch.tensor) -> torch.tensor:
        audio = audio-torch.mean(audio)
        audio = audio/torch.max(torch.abs(audio))
        return audio
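A sketch (not part of the commit) of running the full scoring pipeline on one of the sample recordings shipped under static/. The target sentence below is a placeholder; substitute whatever the recording actually says:

```python
# Sketch: end-to-end scoring of a local 16 kHz recording against a target sentence.
import torchaudio
import pronunciationTrainer

trainer = pronunciationTrainer.getTrainer('en')

signal, fs = torchaudio.load('static/ASR_good.wav')
signal = torchaudio.transforms.Resample(orig_freq=fs, new_freq=16000)(signal)

result = trainer.processAudioForGivenText(signal, 'the expected sentence goes here')  # placeholder text
print(result['pronunciation_accuracy'])
print(result['real_and_transcribed_words'])
```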
requirements.txt
ADDED
@@ -0,0 +1,18 @@
-f https://download.pytorch.org/whl/torch_stable.html
torch==1.10.1
torchaudio==0.10.1
soundfile==0.10.3.post1
omegaconf
epitran==1.15
audioread
requests
dtwalign
eng_to_ipa
pandas
flask
flask_cors
pickle-mixin
sqlalchemy
transformers
sentencepiece
ortools==9.2.9972
static/.DS_Store
ADDED
Binary file (8.2 kB)

static/ASR_bad.wav
ADDED
Binary file (425 kB)

static/ASR_good.wav
ADDED
Binary file (425 kB)

static/ASR_okay.wav
ADDED
Binary file (425 kB)

static/css/.DS_Store
ADDED
Binary file (6.15 kB)
static/css/style-new.css
ADDED
@@ -0,0 +1,471 @@
body {
    background: #f2f2f2;
}


.expanded {
    margin: auto;
    align-content: center;
}

p {
    overflow: auto;
}

h1 {
    margin-left: 2%;
}

a.disabled {
    pointer-events: none;
    color: #ccc;
    background-color: #ccc;
}


.horizontal-flexbox {
    height: 100%;
    width: 100%;
    display: flex;
}

/* ############## Next button ##### */
.button-next {
    border-radius: 4px;
    display: block;
    border: none;
    color: #FFFFFF;
    text-align: left;
    font-size: 3em;
    box-sizing: border-box;
    position: absolute;
    top: 0;
    left: 0%;
    right: 2%;
    bottom: 2%;
    background-color: #58636d;
    width: 10em;

    transition: all 0.5s;
    cursor: pointer;
}

.button-next:hover {
    background-color: #6383a1 !important;
}

.button-next span {
    cursor: pointer;
    display: inline-block;
    position: relative;
    transition: 0.5s;
}

/*
.button-next span:after {
    content: '\00bb';
    position: absolute;
    opacity: 0;
    top: 0;
    right: -20px;
    transition: 0.5s;

}*/

.button-next:hover span {
    padding-right: 25px;
}

.button-next:hover span:after {
    opacity: 1;
    right: 0;
}



/* ############# Texts ############## */

.main-text {
    font-size: 2.5em;
    max-width: 87%;
}

.ipa-text {
    font-size: 1.8em;
    max-width: 87%;
}

.ipa-text-small {
    font-size: 1.5em;
}

.accuracy-text {
    /*font-family: "Dank Mono", ui-monospace, monospace;*/
    background: linear-gradient(to right,
            rgb(54, 56, 80),
            rgb(21, 60, 87));
    background-clip: text;
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    text-align: center;
    font-size: 2em;
    margin-left: 2%;
    left: 0%;
}

.main-text-div {
    overflow-y: auto;
    position: absolute;
    left: 10%;
    right: 10%;
    top: 2%;
    bottom: 2%;
}

/* ############# Card Container ############## */
.container {
    display: block;
    position: absolute;
    left: 2%;
    top: 18%;
    transform: translate(-0%, -0%);
    height: 59%;
    width: 96%;
    max-width: 96%;
    background: #ffff;
    overflow: hidden;
    border-radius: 20px;
    box-shadow: 0 0 20px 8px #d0d0d0;
}

.container-small {
    position: fixed;
    left: 68%;
    top: 79%;
    transform: translate(-0%, -0%);
    height: 7%;
    width: 30%;
    background: #ffff;
    overflow: hidden;
    border-radius: 20px;
    box-shadow: 0 0 20px 8px #d0d0d0;
}

/* ############# Icon Button ############## */

.round-button {
    box-sizing: border-box;
    display: block;
    width: 3em;
    /* 80px */
    height: 3em;
    left: 0%;
    padding-top: 14px;
    padding-left: 0px;
    line-height: 0px;
    border: 6px solid #fff;
    border-radius: 50%;
    color: #f5f5f5;
    text-align: center;
    text-decoration: none;
    background-color: #467387;
    font-size: 20px;
    font-weight: bold;
    transition: all 0.3s ease;
}

.round-button:hover {
    background-color: rgba(0, 0, 0, 0.8);
    box-shadow: 0px 0px 10px #61a4d4;
    text-shadow: 0px 0px 10px #61a4d4;
}

.icon-text {
    font-size: 1em !important;
    text-align: center;
}

.round-button-mic {
    box-sizing: border-box;
    display: block;
    width: 4.5em;
    /* 80px */
    height: 4.5em;
    padding-top: 14px;
    padding-left: -2.25em;
    line-height: 0px;
    border: 6px solid #fff;
    border-radius: 50%;
    color: #f5f5f5;
    text-align: center;
    text-decoration: none;
    background-color: #49d67d;
    /*#467387;*/
    font-size: 20px;
    font-weight: bold;
    transition: all 0.3s ease;
}

.round-button-mic:hover {
    background-color: #477c5b;
    /*rgba(0,0,0,0.8);*/
    box-shadow: 0px 0px 10px #61a4d4;
    text-shadow: 0px 0px 10px #61a4d4;
}

.icon-text-mic {
    font-size: 2.5em !important;
}

.icon-text-home {
    font-size: 3.5em !important;
}

.mic-button-div {
    position: fixed;
    left: 50%;
    top: 80%
}

/*############### Drop-down ############# */
.dropbtn {
    background-color: #ffffff;
    color: rgb(50, 71, 165);
    padding: 0px;
    font-size: 16px;
    border: none;
}

.dropdown {
    position: relative;
    display: inline-block;
}

.dropdown-content {
    display: none;
    position: absolute;
    background-color: #ffffff;
    min-width: 160px;
    box-shadow: 0px 8px 16px 0px rgba(0, 0, 0, 0.2);
    z-index: 1;
}

.dropdown-content a {
    color: black;
    padding: 12px 16px;
    text-decoration: none;
    display: block;
}

.dropdown-content a:hover {
    background-color: #ddd;
}

.dropdown:hover .dropdown-content {
    display: block;
}

.dropdown:hover .dropbtn {
    background-color: #3e8e41;
}

/* ############# Arrow ############## position: relative; position: absolute;*/
.load-more {
    position: fixed;
    cursor: pointer;
    width: 100px;
    height: 100px;
    margin: -0px 0 0 -0px;
    min-width: 10px;
    min-height: 10px;

    left: 90%;
    top: 45%;
    border-width: 2px;
    border-style: solid;
    border-color: transparent;
    border-bottom-color: #000;
    border-right-color: #000;
    border-radius: 0 0 5px 0;

    transform: translate(-0%, -0%) rotate(-45deg);
}

/* ######## Radio Buttons ############## */
.radio {
    background: #f6f7fd;
    padding: 4px;
    border-radius: 3px;
    box-shadow: inset 0 0 0 3px rgba(35, 33, 45, 0.3),
        0 0 0 3px rgba(185, 185, 185, 0.3);
    position: relative;
}

.radio input {
    width: max-content;
    height: 100%;
    appearance: none;
    outline: none;
    cursor: pointer;
    border-radius: 2px;
    padding: 4px 8px;
    background: #454857;
    color: #bdbdbdbd;
    font-size: 0.8em;
    font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
        "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji",
        "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
    transition: all 100ms linear;
}

.radio input:checked {
    background-image: linear-gradient(180deg, #4e70ce, #5197d8);
    color: #fff;
    box-shadow: 0 1px 1px #0000002e;
    text-shadow: 0 1px 0px #79485f7a;
}

.radio input:before {
    content: attr(label);
    display: inline-block;
    text-align: center;
    width: 100%;
}

/* ############ Links and credits ####*/

.link-icon-div {
    position: fixed;
    left: 90.0%;
    top: 0.0%;
    vertical-align: middle;
    align-content: flex-start;
}

.credits-icon-div {
    position: fixed;
    left: 90.5%;
    top: 95%;
    font-size: x-small;
}

.svg-icon {
    padding-top: 1em;
    width: 50px;
    height: 50px;

}


/* ######## Switch ############## */
@media only screen and (max-width: 1200px) {
    .round-button {
        box-sizing: border-box;
        display: block;
        width: 2em;
        /* 80px */
        height: 2em;
        left: -2.5%;
        padding-top: 0.3em;
        padding-left: 0px;
        line-height: 0px;
        border: 6px solid #fff;
        border-radius: 50%;
        color: #f5f5f5;
        text-align: center;
        text-decoration: none;
        background-color: #467387;
        font-size: 1em;
        font-weight: bold;
        transition: all 0.3s ease;
    }

    .container {
        display: block;
        position: absolute;
        left: 2%;
        top: 22%;
        transform: translate(-0%, -0%);
        height: 55%;
        width: 96%;
        max-width: 96%;
        background: #ffff;
        overflow: hidden;
        border-radius: 20px;
        box-shadow: 0 0 20px 8px #d0d0d0;
    }

    .icon-text {
        font-size: 0.8em !important;
        text-align: center;
    }

    .ipa-text-small {
        font-size: small;
    }

    .round-button-mic {
        box-sizing: border-box;
        display: block;
        width: 3.5em;
        /* 80px */
        height: 3.5em;
        padding-top: 0.4em;
        left: 40%;
        line-height: 0px;
        border: 6px solid #fff;
        border-radius: 50%;
        color: #f5f5f5;
        text-align: center;
        text-decoration: none;
        background-color: #49d67d;
        font-size: 20px;
        font-weight: bold;
        transition: all 0.3s ease;
    }

    .mic-button-div {
        position: fixed;
        left: 40%;
        top: 80%
    }

    .link-icon-div {
        position: fixed;
        left: 89.0%;
        top: 0.0%;
        vertical-align: middle;
    }

    .credits-icon-div {
        position: fixed;
        left: 78.5%;
        top: 95%;
        font-size: x-small;
    }

    .svg-icon {
        padding-top: 1em;
        width: 40px;
        height: 40px;
    }


    .icon-text-home {
        font-size: 2.5em !important;
    }

    .accuracy-text {
        font-family: "Dank Mono", ui-monospace, monospace;
        background: linear-gradient(to right,
                rgb(54, 56, 80),
|
462 |
+
rgb(21, 60, 87));
|
463 |
+
left: -5.0%;
|
464 |
+
background-clip: text;
|
465 |
+
-webkit-background-clip: text;
|
466 |
+
-webkit-text-fill-color: transparent;
|
467 |
+
text-align: center;
|
468 |
+
font-size: 0.8em;
|
469 |
+
}
|
470 |
+
|
471 |
+
}
|
static/javascript/callbacks.js
ADDED
@@ -0,0 +1,584 @@
// Audio context initialization
let mediaRecorder, audioChunks, audioBlob, stream, audioRecorded;
const ctx = new AudioContext();
let currentAudioForPlaying;
let lettersOfWordAreCorrect = [];

// UI-related variables
const page_title = "AI Pronunciation Trainer";
const accuracy_colors = ["green", "orange", "red"];
let badScoreThreshold = 30;
let mediumScoreThreshold = 70;
let currentSample = 0;
let currentScore = 0.;
let sample_difficult = 0;
let scoreMultiplier = 1;
let playAnswerSounds = true;
let isNativeSelectedForPlayback = true;
let isRecording = false;
let serverIsInitialized = false;
let serverWorking = true;
let languageFound = true;
let currentSoundRecorded = false;
let currentText, currentIpa, real_transcripts_ipa, matched_transcripts_ipa;
let wordCategories;
let startTime, endTime;

// API related variables
let AILanguage = "de"; // Standard is German

let STScoreAPIKey = 'rll5QsTiv83nti99BW6uCmvs9BDVxSB39SVFceYb'; // Public Key. If, for some reason, you would like a private one, send me a message and we can discuss some possibilities
let apiMainPathSample = '';// 'http://127.0.0.1:3001';// 'https://a3hj0l2j2m.execute-api.eu-central-1.amazonaws.com/Prod';
let apiMainPathSTS = '';// 'https://wrg7ayuv7i.execute-api.eu-central-1.amazonaws.com/Prod';

// Variables to playback accuracy sounds
let soundsPath = '../static';//'https://stscore-sounds-bucket.s3.eu-central-1.amazonaws.com';
let soundFileGood = null;
let soundFileOkay = null;
let soundFileBad = null;

// Speech generation
var synth = window.speechSynthesis;
let voice_idx = 0;
let voice_synth = null;

//############################ UI general control functions ###################
const unblockUI = () => {
    document.getElementById("recordAudio").classList.remove('disabled');
    document.getElementById("playSampleAudio").classList.remove('disabled');
    document.getElementById("buttonNext").onclick = () => getNextSample();
    document.getElementById("nextButtonDiv").classList.remove('disabled');
    document.getElementById("original_script").classList.remove('disabled');
    document.getElementById("buttonNext").style["background-color"] = '#58636d';

    if (currentSoundRecorded)
        document.getElementById("playRecordedAudio").classList.remove('disabled');
};

const blockUI = () => {
    document.getElementById("recordAudio").classList.add('disabled');
    document.getElementById("playSampleAudio").classList.add('disabled');
    document.getElementById("buttonNext").onclick = null;
    document.getElementById("original_script").classList.add('disabled');
    document.getElementById("playRecordedAudio").classList.add('disabled');

    document.getElementById("buttonNext").style["background-color"] = '#adadad';
};

const UIError = () => {
    blockUI();
    document.getElementById("buttonNext").onclick = () => getNextSample(); // If error, user can only try to get a new sample
    document.getElementById("buttonNext").style["background-color"] = '#58636d';

    document.getElementById("recorded_ipa_script").innerHTML = "";
    document.getElementById("single_word_ipa_pair").innerHTML = "Error";
    document.getElementById("ipa_script").innerHTML = "Error"

    document.getElementById("main_title").innerHTML = 'Server Error';
    document.getElementById("original_script").innerHTML = 'Server error. Either the daily quota of the server is over or there was some internal error. You can try to generate a new sample in a few seconds. If the error persists, try coming back tomorrow or download the local version from Github :)';
};

const UINotSupported = () => {
    unblockUI();

    document.getElementById("main_title").innerHTML = "Browser unsupported";
}

const UIRecordingError = () => {
    unblockUI();
    document.getElementById("main_title").innerHTML = "Recording error, please try again or restart page.";
    startMediaDevice();
}

//################### Application state functions #######################
function updateScore(currentPronunciationScore) {
    if (isNaN(currentPronunciationScore))
        return;
    currentScore += currentPronunciationScore * scoreMultiplier;
    currentScore = Math.round(currentScore);
}

const cacheSoundFiles = async () => {
    await fetch(soundsPath + '/ASR_good.wav').then(data => data.arrayBuffer()).
        then(arrayBuffer => ctx.decodeAudioData(arrayBuffer)).
        then(decodeAudioData => {
            soundFileGood = decodeAudioData;
        });

    await fetch(soundsPath + '/ASR_okay.wav').then(data => data.arrayBuffer()).
        then(arrayBuffer => ctx.decodeAudioData(arrayBuffer)).
        then(decodeAudioData => {
            soundFileOkay = decodeAudioData;
        });

    await fetch(soundsPath + '/ASR_bad.wav').then(data => data.arrayBuffer()).
        then(arrayBuffer => ctx.decodeAudioData(arrayBuffer)).
        then(decodeAudioData => {
            soundFileBad = decodeAudioData;
        });
}

const getNextSample = async () => {
    blockUI();

    if (!serverIsInitialized)
        await initializeServer();

    if (!serverWorking) {
        UIError();
        return;
    }

    if (soundFileBad == null)
        cacheSoundFiles();

    updateScore(parseFloat(document.getElementById("pronunciation_accuracy").innerHTML));

    document.getElementById("main_title").innerHTML = "Processing new sample...";

    if (document.getElementById('lengthCat1').checked) {
        sample_difficult = 0;
        scoreMultiplier = 1.3;
    }
    else if (document.getElementById('lengthCat2').checked) {
        sample_difficult = 1;
        scoreMultiplier = 1;
    }
    else if (document.getElementById('lengthCat3').checked) {
        sample_difficult = 2;
        scoreMultiplier = 1.3;
    }
    else if (document.getElementById('lengthCat4').checked) {
        sample_difficult = 3;
        scoreMultiplier = 1.6;
    }

    try {
        await fetch(apiMainPathSample + '/getSample', {
            method: "post",
            body: JSON.stringify({
                "category": sample_difficult.toString(), "language": AILanguage
            }),
            headers: { "X-Api-Key": STScoreAPIKey }
        }).then(res => res.json()).
            then(data => {
                let doc = document.getElementById("original_script");
                currentText = data.real_transcript;
                doc.innerHTML = currentText;

                currentIpa = data.ipa_transcript

                let doc_ipa = document.getElementById("ipa_script");
                doc_ipa.innerHTML = "/ " + currentIpa + " /";

                document.getElementById("recorded_ipa_script").innerHTML = ""
                document.getElementById("pronunciation_accuracy").innerHTML = "";
                document.getElementById("single_word_ipa_pair").innerHTML = "Reference | Spoken"
                document.getElementById("section_accuracy").innerHTML = "| Score: " + currentScore.toString() + " - (" + currentSample.toString() + ")";
                currentSample += 1;

                document.getElementById("main_title").innerHTML = page_title;

                document.getElementById("translated_script").innerHTML = data.transcript_translation;

                currentSoundRecorded = false;
                unblockUI();
                document.getElementById("playRecordedAudio").classList.add('disabled');
            })
    }
    catch
    {
        UIError();
    }
};

const updateRecordingState = async () => {
    if (isRecording) {
        stopRecording();
        return
    }
    else {
        recordSample()
        return;
    }
}

const generateWordModal = (word_idx) => {
    document.getElementById("single_word_ipa_pair").innerHTML = wrapWordForPlayingLink(real_transcripts_ipa[word_idx], word_idx, false, "black")
        + ' | ' + wrapWordForPlayingLink(matched_transcripts_ipa[word_idx], word_idx, true, accuracy_colors[parseInt(wordCategories[word_idx])])
}

const recordSample = async () => {
    document.getElementById("main_title").innerHTML = "Recording... click again when done speaking";
    document.getElementById("recordIcon").innerHTML = 'pause_presentation';
    blockUI();
    document.getElementById("recordAudio").classList.remove('disabled');
    audioChunks = [];
    isRecording = true;
    mediaRecorder.start();
}

const changeLanguage = (language, generateNewSample = false) => {
    voices = synth.getVoices();
    AILanguage = language;
    languageFound = false;
    let languageIdentifier, languageName;
    switch (language) {
        case 'de':
            document.getElementById("languageBox").innerHTML = "German";
            languageIdentifier = 'de';
            languageName = 'Anna';
            break;

        case 'en':
            document.getElementById("languageBox").innerHTML = "English";
            languageIdentifier = 'en';
            languageName = 'Daniel';
            break;
    };

    for (idx = 0; idx < voices.length; idx++) {
        if (voices[idx].lang.slice(0, 2) == languageIdentifier && voices[idx].name == languageName) {
            voice_synth = voices[idx];
            languageFound = true;
            break;
        }
    }
    // If specific voice not found, search anything with the same language
    if (!languageFound) {
        for (idx = 0; idx < voices.length; idx++) {
            if (voices[idx].lang.slice(0, 2) == languageIdentifier) {
                voice_synth = voices[idx];
                languageFound = true;
                break;
            }
        }
    }
    if (generateNewSample)
        getNextSample();
}

//################### Speech-To-Score function ########################
const mediaStreamConstraints = {
    audio: {
        channelCount: 1,
        sampleRate: 48000
    }
}

const startMediaDevice = () => {
    navigator.mediaDevices.getUserMedia(mediaStreamConstraints).then(_stream => {
        stream = _stream
        mediaRecorder = new MediaRecorder(stream);

        let currentSamples = 0
        mediaRecorder.ondataavailable = event => {
            currentSamples += event.data.length
            audioChunks.push(event.data);
        };

        mediaRecorder.onstop = async () => {
            document.getElementById("recordIcon").innerHTML = 'mic';
            blockUI();

            audioBlob = new Blob(audioChunks, { type: 'audio/ogg;' });

            let audioUrl = URL.createObjectURL(audioBlob);
            audioRecorded = new Audio(audioUrl);

            let audioBase64 = await convertBlobToBase64(audioBlob);

            let minimumAllowedLength = 6;
            if (audioBase64.length < minimumAllowedLength) {
                setTimeout(UIRecordingError, 50); // Make sure this function finishes before it gets called again
                return;
            }

            try {
                await fetch(apiMainPathSTS + '/GetAccuracyFromRecordedAudio', {
                    method: "post",
                    body: JSON.stringify({ "title": currentText[0], "base64Audio": audioBase64, "language": AILanguage }),
                    headers: { "X-Api-Key": STScoreAPIKey }
                }).then(res => res.json()).
                    then(data => {
                        if (playAnswerSounds)
                            playSoundForAnswerAccuracy(parseFloat(data.pronunciation_accuracy))

                        document.getElementById("recorded_ipa_script").innerHTML = "/ " + data.ipa_transcript + " /";
                        document.getElementById("recordAudio").classList.add('disabled');
                        document.getElementById("main_title").innerHTML = page_title;
                        document.getElementById("pronunciation_accuracy").innerHTML = data.pronunciation_accuracy + "%";

                        lettersOfWordAreCorrect = data.is_letter_correct_all_words.split(" ")

                        startTime = data.start_time;
                        endTime = data.end_time;

                        real_transcripts_ipa = data.real_transcripts_ipa.split(" ")
                        matched_transcripts_ipa = data.matched_transcripts_ipa.split(" ")
                        wordCategories = data.pair_accuracy_category.split(" ")
                        let currentTextWords = currentText[0].split(" ")

                        coloredWords = "";
                        for (let word_idx = 0; word_idx < currentTextWords.length; word_idx++) {
                            wordTemp = '';
                            for (let letter_idx = 0; letter_idx < currentTextWords[word_idx].length; letter_idx++) {
                                letter_is_correct = lettersOfWordAreCorrect[word_idx][letter_idx] == '1'
                                if (letter_is_correct)
                                    color_letter = 'green'
                                else
                                    color_letter = 'red'

                                wordTemp += '<font color=' + color_letter + '>' + currentTextWords[word_idx][letter_idx] + "</font>"
                            }
                            coloredWords += " " + wrapWordForIndividualPlayback(wordTemp, word_idx)
                        }

                        document.getElementById("original_script").innerHTML = coloredWords

                        currentSoundRecorded = true;
                        unblockUI();
                        document.getElementById("playRecordedAudio").classList.remove('disabled');
                    });
            }
            catch {
                UIError();
            }
        };
    });
};
startMediaDevice();

// ################### Audio playback ##################
const playSoundForAnswerAccuracy = async (accuracy) => {
    currentAudioForPlaying = soundFileGood;
    if (accuracy < mediumScoreThreshold) {
        if (accuracy < badScoreThreshold) {
            currentAudioForPlaying = soundFileBad;
        }
        else {
            currentAudioForPlaying = soundFileOkay;
        }
    }
    playback();
}

const playAudio = async () => {
    document.getElementById("main_title").innerHTML = "Generating sound...";
    playWithMozillaApi(currentText[0]);
    document.getElementById("main_title").innerHTML = "Current Sound was played";
};

function playback() {
    const playSound = ctx.createBufferSource();
    playSound.buffer = currentAudioForPlaying;
    playSound.connect(ctx.destination);
    playSound.start(ctx.currentTime)
}

const playRecording = async (start = null, end = null) => {
    blockUI();

    try {
        if (start == null || end == null) {
            endTimeInMs = Math.round(audioRecorded.duration * 1000)
            audioRecorded.addEventListener("ended", function () {
                audioRecorded.currentTime = 0;
                unblockUI();
                document.getElementById("main_title").innerHTML = "Recorded Sound was played";
            });
            await audioRecorded.play();
        }
        else {
            audioRecorded.currentTime = start;
            audioRecorded.play();
            durationInSeconds = end - start;
            endTimeInMs = Math.round(durationInSeconds * 1000);
            setTimeout(function () {
                unblockUI();
                audioRecorded.pause();
                audioRecorded.currentTime = 0;
                document.getElementById("main_title").innerHTML = "Recorded Sound was played";
            }, endTimeInMs);
        }
    }
    catch {
        UINotSupported();
    }
};

const playNativeAndRecordedWord = async (word_idx) => {
    if (isNativeSelectedForPlayback)
        playCurrentWord(word_idx)
    else
        playRecordedWord(word_idx);

    isNativeSelectedForPlayback = !isNativeSelectedForPlayback;
}

const stopRecording = () => {
    isRecording = false
    mediaRecorder.stop()
    document.getElementById("main_title").innerHTML = "Processing audio...";
}

const playCurrentWord = async (word_idx) => {
    document.getElementById("main_title").innerHTML = "Generating word...";
    playWithMozillaApi(currentText[0].split(' ')[word_idx]);
    document.getElementById("main_title").innerHTML = "Word was played";
}

// TODO: Check if fallback is correct
const playWithMozillaApi = (text) => {
    if (languageFound) {
        blockUI();
        if (voice_synth == null)
            changeLanguage(AILanguage);

        var utterThis = new SpeechSynthesisUtterance(text);
        utterThis.voice = voice_synth;
        utterThis.rate = 0.7;
        utterThis.onend = function (event) {
            unblockUI();
        }
        synth.speak(utterThis);
    }
    else {
        UINotSupported();
    }
}

const playRecordedWord = (word_idx) => {
    wordStartTime = parseFloat(startTime.split(' ')[word_idx]);
    wordEndTime = parseFloat(endTime.split(' ')[word_idx]);

    playRecording(wordStartTime, wordEndTime);
}

// ############# Utils #####################
const convertBlobToBase64 = async (blob) => {
    return await blobToBase64(blob);
}

const blobToBase64 = blob => new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.readAsDataURL(blob);
    reader.onload = () => resolve(reader.result);
    reader.onerror = error => reject(error);
});

const wrapWordForPlayingLink = (word, word_idx, isFromRecording, word_accuracy_color) => {
    if (isFromRecording)
        return '<a style = " white-space:nowrap; color:' + word_accuracy_color + '; " href="javascript:playRecordedWord(' + word_idx.toString() + ')" >' + word + '</a> '
    else
        return '<a style = " white-space:nowrap; color:' + word_accuracy_color + '; " href="javascript:playCurrentWord(' + word_idx.toString() + ')" >' + word + '</a> '
}

const wrapWordForIndividualPlayback = (word, word_idx) => {
    return '<a onmouseover="generateWordModal(' + word_idx.toString() + ')" style = " white-space:nowrap; " href="javascript:playNativeAndRecordedWord(' + word_idx.toString() + ')" >' + word + '</a> '
}

// ########## Function to initialize server ###############
// This is to try to avoid aws lambda cold start
try {
    fetch(apiMainPathSTS + '/GetAccuracyFromRecordedAudio', {
        method: "post",
        body: JSON.stringify({ "title": '', "base64Audio": '', "language": AILanguage }),
        headers: { "X-Api-Key": STScoreAPIKey }
    });
}
catch { }

const initializeServer = async () => {
    valid_response = false;
    document.getElementById("main_title").innerHTML = 'Initializing server, this may take up to 2 minutes...';
    let number_of_tries = 0;
    let maximum_number_of_tries = 4;

    while (!valid_response) {
        if (number_of_tries > maximum_number_of_tries) {
            serverWorking = false;
            break;
        }

        try {
            await fetch(apiMainPathSTS + '/GetAccuracyFromRecordedAudio', {
                method: "post",
                body: JSON.stringify({ "title": '', "base64Audio": '', "language": AILanguage }),
                headers: { "X-Api-Key": STScoreAPIKey }
            }).then(
                // Mark success only once the warm-up request resolves
                () => valid_response = true);
            serverIsInitialized = true;
        }
        catch
        {
            number_of_tries += 1;
        }
    }
}
templates/.DS_Store
ADDED
Binary file (6.15 kB)
templates/main.html
ADDED
@@ -0,0 +1,181 @@
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">

    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet"
        integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3" crossorigin="anonymous">

    <link rel="stylesheet" href="../static/css/style-new.css">
    <script src="../static/javascript/callbacks.js"></script>

    <title>AI pronunciation trainer</title>

    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js" type="text/javascript"></script>

    <link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">

</head>

<body style="height: 100%; width: 100%; background-color: white; max-width: 90%;">

    <div>

        <div style="display:flex; flex-direction: row;">

            <div style="display:inline-block; margin-left: 1.5em;">
                <i class="material-icons icon-text-home" style="text-align: right;" onclick="history.go(0)">home</i>
            </div>

            <h1 id='main_title'> AI Pronunciation Trainer
            </h1>
        </div>

        <div class="expanded">
            <div class="horizontal-flexbox" style="display:flex; flex-direction: row;">

                <p class="accuracy-text" style="font-size:1em; text-align: left; padding-top: 3px; padding-left: 5px;">
                    Language: </p>
                <!--dropbtn accuracy-text-->
                <div class="dropdown">
                    <button id="languageBox" class="dropbtn accuracy-text"
                        style="font-size:1em; text-align: left; padding-top: 3px; padding-left: 0px;">German</button>
                    <div class="dropdown-content">
                        <a href="javascript:changeLanguage('de',true)" class="accuracy-text"
                            style="padding-top: 3px; ">German</a>
                        <a href="javascript:changeLanguage('en',true)" class="accuracy-text"
                            style="padding-top: 3px; ">English</a>
                    </div>
                </div>

                <p id="section_accuracy" class="accuracy-text"
                    style="text-align: left; color: black; font-size: larger;">
                    | Score: 0
                </p>

            </div>
        </div>

        <div style="margin-bottom: 200px;">

        </div>

        <div class="container">

            <div class="horizontal-flexbox" style="position: absolute; top: 2%; ">

                <a id="playSampleAudio" href="javascript:playAudio()" class="round-button disabled" style="color:white; text-align:center;
                    position: absolute; top: 2%; "><i class="material-icons icon-text">play_arrow</i>
                </a>

                <a id="playRecordedAudio" href="javascript:playRecording()" class="round-button disabled"
                    style="color:white; text-align:center; position: absolute; top: 15%; "><i
                        class="material-icons icon-text">record_voice_over</i>
                </a>
                <p id="pronunciation_accuracy" class="expanded accuracy-text"
                    style="text-align: center; color: black; position: absolute; top: 27%; ">
                    -
                </p>

            </div>

            <div id="text-area" class="main-text-div">

                <p id="original_script" class="bigger-text text-primary main-text">Click on the bar on the right to
                    generate a new sentence (please use the Chrome web browser).
                </p>
                <p id="ipa_script" class="text-muted bigger-text ipa-text"> Before speaking, click on the mic button
                    below to start recording and then click again when you're done.
                </p>
                <p id="recorded_ipa_script" class="text-primary ipa-text">On the bottom left you can choose the
                    difficulty. On the upper left you can choose the language.
                </p>
                <p id="translated_script" class="text-muted medium-text ipa-text"> The corresponding IPA reading of each
                    sentence will also be displayed. If you have never heard of IPA, you can check out this
                    <a href="https://www.youtube.com/watch?v=mzrLZi6fipA&list=RDCMUCQAUWk_yGz7bk1181DrijNw&start_radio=1&rv=mzrLZi6fipA&t=22&ab_channel=FluentForever"
                        target="_blank">playlist</a>. Try to get at least 690 points a day. Don't be shy! You can do it
                    :)
                </p>

            </div>

            <div id="nextButtonDiv" style="position: absolute; left: 90%; top:0%; height: 100%;" class="flex-container">
                <button id="buttonNext" class="expanded button-next" onclick="javascript:getNextSample()">
                    <span></span></button>
            </div>
        </div>

        <div class="container-small flex expand"
            style="align-items: center; text-align: center; vertical-align:middle; ">
            <p id="single_word_ipa_pair" class="expand ipa-text-small"
                style="text-align: center; vertical-align: middle;">Reference | Spoken
            </p>
        </div>

        <div id="btn-record" class="expanded mic-button-div">
            <a id="recordAudio" href="javascript:updateRecordingState()" class="round-button-mic disabled"
                style="color:white; text-align:center; "><i id="recordIcon" class="material-icons icon-text-mic">mic</i>
            </a>
        </div>

        <div id="radio-difficulty" class="radio" style="position: fixed; top: 95%; left: 2%;">
            <input label="Random" type="radio" id="lengthCat1" name='length' onclick="javascript:getNextSample()">
            <input label="Easy" type="radio" id="lengthCat2" name='length' checked onclick="javascript:getNextSample()">
            <input label="Medium" type="radio" id="lengthCat3" name='length' onclick="javascript:getNextSample()">
            <input label="Hard" type="radio" id="lengthCat4" name='length' onclick="javascript:getNextSample()">
        </div>

    </div>

    <p class="credits-icon-div">By Thiago
        Lobato.</p>

    <div class="link-icon-div">
        <a href="https://github.com/Thiagohgl/ai-pronunciation-trainer" target="_blank"
            style="text-decoration:none; vertical-align: middle; ">
            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" class="svg-icon">
                <path
                    d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z" />
            </svg>
        </a>

        <a href="https://www.linkedin.com/in/thiagohgl/" target="_blank"
            style="text-decoration:none; vertical-align: middle; padding-top: 2.3em; ">
            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" class="svg-icon">
                <path
                    d="M19 0h-14c-2.761 0-5 2.239-5 5v14c0 2.761 2.239 5 5 5h14c2.762 0 5-2.239 5-5v-14c0-2.761-2.238-5-5-5zm-11 19h-3v-11h3v11zm-1.5-12.268c-.966 0-1.75-.79-1.75-1.764s.784-1.764 1.75-1.764 1.75.79 1.75 1.764-.783 1.764-1.75 1.764zm13.5 12.268h-3v-5.604c0-3.368-4-3.113-4 0v5.604h-3v-11h3v1.765c1.396-2.586 7-2.777 7 2.476v6.759z" />
            </svg>

        </a>
    </div>

</body>

</html>
unitTests.py
ADDED
@@ -0,0 +1,99 @@
import unittest

import ModelInterfaces
import lambdaGetSample
import RuleBasedModels
import epitran
import json
import pronunciationTrainer


def test_category(category: int, threshold_min: int, threshold_max: int):
    event = {'body': json.dumps({'category': category, 'language': 'de'})}
    for _ in range(1000):
        response = lambdaGetSample.lambda_handler(event, [])
        response_dict = json.loads(response)
        number_of_words = len(
            response_dict['real_transcript'][0].split())
        length_valid = number_of_words > threshold_min and number_of_words <= threshold_max
        if not length_valid:
            print('Category ', category,
                  ' had a sentence with length ', number_of_words)
            return False
    return True


class TestDataset(unittest.TestCase):

    def test_random_sentences(self):
        self.assertFalse(test_category(0, 0, 8))

    def test_easy_sentences(self):
        self.assertTrue(test_category(1, 0, 8))

    def test_normal_sentences(self):
        self.assertTrue(test_category(2, 8, 20))

    def test_hard_sentences(self):
        self.assertTrue(test_category(3, 20, 10000))


def check_phonem_converter(converter: ModelInterfaces.ITextToPhonemModel, input: str, expected_output: str):
    output = converter.convertToPhonem(input)

    is_correct = output == expected_output
    if not is_correct:
        print('Conversion from "', input, '" should be "',
              expected_output, '", but was "', output, '"')
    return is_correct


class TestPhonemConverter(unittest.TestCase):

    def test_english(self):
        phonem_converter = RuleBasedModels.EngPhonemConverter()
        self.assertTrue(check_phonem_converter(
            phonem_converter, 'Hello, this is a test', 'hɛˈloʊ, ðɪs ɪz ə tɛst'))

    def test_german(self):
        phonem_converter = RuleBasedModels.EpitranPhonemConverter(
            epitran.Epitran('deu-Latn'))

        self.assertTrue(check_phonem_converter(
            phonem_converter, 'Hallo, das ist ein Test', 'haloː, dɑːs ɪst ain tɛst'))


trainer_SST_lambda = {}
trainer_SST_lambda['de'] = pronunciationTrainer.getTrainer("de")


class TestScore(unittest.TestCase):

    def test_exact_transcription(self):
        words_real = 'Ich habe sehr viel glück, am leben und gesund zu sein'

        real_and_transcribed_words, _, _ = trainer_SST_lambda['de'].matchSampleAndRecordedWords(
            words_real, words_real)

        pronunciation_accuracy, _ = trainer_SST_lambda['de'].getPronunciationAccuracy(
            real_and_transcribed_words)

        self.assertTrue(int(pronunciation_accuracy) == 100)

    def test_incorrect_transcription(self):
        words_real = 'Ich habe sehr viel glück, am leben und gesund zu sein'
        words_transcribed = 'Ic hab zeh viel guck am und gesund tu sein'

        real_and_transcribed_words, _, _ = trainer_SST_lambda['de'].matchSampleAndRecordedWords(
            words_real, words_transcribed)

        pronunciation_accuracy, _ = trainer_SST_lambda['de'].getPronunciationAccuracy(
            real_and_transcribed_words)

        self.assertTrue(int(pronunciation_accuracy) == 71)


if __name__ == '__main__':
    unittest.main()
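Since the file ends in `unittest.main()`, the whole suite can be run with `python unitTests.py`. A minimal sketch for running only one test case is shown below; the selective loading is an illustration, not part of the committed code, and importing `unitTests` still constructs the German trainer at module level, so the models and sample data must be available.

```python
# Hypothetical selective runner for the committed test suite.
import unittest
import unitTests  # importing this builds trainer_SST_lambda['de'] at module level

# Load only the phoneme-converter checks and run them verbosely.
suite = unittest.defaultTestLoader.loadTestsFromTestCase(unitTests.TestPhonemConverter)
unittest.TextTestRunner(verbosity=2).run(suite)
```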
utilsFileIO.py
ADDED
@@ -0,0 +1,9 @@
import string
import random


def generateRandomString(str_length: int = 20):

    # Build the string from lowercase ASCII letters
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(str_length))
webApp.py
ADDED
@@ -0,0 +1,47 @@
1 |
+
from flask import Flask, render_template, request
|
2 |
+
import webbrowser
|
3 |
+
import os
|
4 |
+
from flask_cors import CORS
|
5 |
+
import json
|
6 |
+
|
7 |
+
import lambdaTTS
|
8 |
+
import lambdaSpeechToScore
|
9 |
+
import lambdaGetSample
|
10 |
+
|
11 |
+
app = Flask(__name__)
|
12 |
+
cors = CORS(app)
|
13 |
+
app.config['CORS_HEADERS'] = '*'
|
14 |
+
|
15 |
+
rootPath = ''
|
16 |
+
|
17 |
+
|
18 |
+
@app.route(rootPath+'/')
|
19 |
+
def main():
|
20 |
+
return render_template('main.html')
|
21 |
+
|
22 |
+
|
23 |
+
@app.route(rootPath+'/getAudioFromText', methods=['POST'])
|
24 |
+
def getAudioFromText():
|
25 |
+
event = {'body': json.dumps(request.get_json(force=True))}
|
26 |
+
return lambdaTTS.lambda_handler(event, [])
|
27 |
+
|
28 |
+
|
29 |
+
@app.route(rootPath+'/getSample', methods=['POST'])
|
30 |
+
def getNext():
|
31 |
+
event = {'body': json.dumps(request.get_json(force=True))}
|
32 |
+
return lambdaGetSample.lambda_handler(event, [])
|
33 |
+
|
34 |
+
|
35 |
+
@app.route(rootPath+'/GetAccuracyFromRecordedAudio', methods=['POST'])
|
36 |
+
def GetAccuracyFromRecordedAudio():
|
37 |
+
|
38 |
+
event = {'body': json.dumps(request.get_json(force=True))}
|
39 |
+
lambda_correct_output = lambdaSpeechToScore.lambda_handler(event, [])
|
40 |
+
return lambda_correct_output
|
41 |
+
|
42 |
+
|
43 |
+
if __name__ == "__main__":
|
44 |
+
language = 'de'
|
45 |
+
print(os.system('pwd'))
|
46 |
+
webbrowser.open_new('http://127.0.0.1:3000/')
|
47 |
+
app.run(host="0.0.0.0", port=3000)
|
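For local testing, the routes above can be exercised with a small client. The sketch below is illustrative only: the request fields mirror what callbacks.js sends, but the data-URL framing of the audio, the file name `my_recording.ogg`, and the exact response parsing are assumptions rather than a documented contract.

```python
# Hypothetical local client for webApp.py running on http://127.0.0.1:3000.
import base64
import requests

BASE_URL = "http://127.0.0.1:3000"

# Ask for an "easy" German sentence (categories mirror the radio buttons in main.html).
sample = requests.post(BASE_URL + "/getSample",
                       json={"category": "1", "language": "de"}).json()
sentence = sample["real_transcript"][0]
print(sentence, "->", sample["ipa_transcript"])

# Score a recording of that sentence; callbacks.js sends the audio as a data URL,
# so the same framing is assumed here for a locally recorded Ogg file.
with open("my_recording.ogg", "rb") as f:
    audio_b64 = "data:audio/ogg;base64," + base64.b64encode(f.read()).decode()

score = requests.post(BASE_URL + "/GetAccuracyFromRecordedAudio",
                      json={"title": sentence, "base64Audio": audio_b64,
                            "language": "de"}).json()
print(score["pronunciation_accuracy"], "%")
```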