thiagohgl committed
Commit
28d0c5f
·
1 Parent(s): 8c4fbaf

First repository code commit

AIModels.py ADDED
@@ -0,0 +1,65 @@
+ import ModelInterfaces
+ import torch
+ import numpy as np
+
+
+ class NeuralASR(ModelInterfaces.IASRModel):
+     word_locations_in_samples = None
+     audio_transcript = None
+
+     def __init__(self, model: torch.nn.Module, decoder) -> None:
+         super().__init__()
+         self.model = model
+         self.decoder = decoder  # Decoder from CTC-outputs to transcripts
+
+     def getTranscript(self) -> str:
+         """Get the transcript of the processed audio"""
+         assert self.audio_transcript is not None, \
+             'Cannot get the audio transcript without having processed the audio'
+         return self.audio_transcript
+
+     def getWordLocations(self) -> list:
+         """Get the word locations (in samples) from the processed audio"""
+         assert self.word_locations_in_samples is not None, \
+             'Cannot get word locations without having processed the audio'
+
+         return self.word_locations_in_samples
+
+     def processAudio(self, audio: torch.Tensor):
+         """Process the audio"""
+         audio_length_in_samples = audio.shape[1]
+         with torch.inference_mode():
+             nn_output = self.model(audio)
+
+             self.audio_transcript, self.word_locations_in_samples = self.decoder(
+                 nn_output[0, :, :].detach(), audio_length_in_samples, word_align=True)
+
+
+ class NeuralTTS(ModelInterfaces.ITextToSpeechModel):
+     def __init__(self, model: torch.nn.Module, sampling_rate: int) -> None:
+         super().__init__()
+         self.model = model
+         self.sampling_rate = sampling_rate
+
+     def getAudioFromSentence(self, sentence: str) -> np.array:
+         with torch.inference_mode():
+             audio_transcript = self.model.apply_tts(texts=[sentence],
+                                                     sample_rate=self.sampling_rate)[0]
+
+         return audio_transcript
+
+
+ class NeuralTranslator(ModelInterfaces.ITranslationModel):
+     def __init__(self, model: torch.nn.Module, tokenizer) -> None:
+         super().__init__()
+         self.model = model
+         self.tokenizer = tokenizer
+
+     def translateSentence(self, sentence: str) -> str:
+         """Get the translation of the sentence"""
+         tokenized_text = self.tokenizer(sentence, return_tensors='pt')
+         translation = self.model.generate(**tokenized_text)
+         translated_text = self.tokenizer.batch_decode(
+             translation, skip_special_tokens=True)[0]
+
+         return translated_text
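For orientation, a minimal sketch of how `NeuralASR` is meant to be driven (the same `processAudio` → getter order that `pronunciationTrainer.py` below relies on). The silence tensor is only a placeholder input, and loading the Silero model requires an internet connection:

```python
import torch

import models
import AIModels

# getASRModel returns a (model, decoder) pair, as defined in models.py below.
model, decoder = models.getASRModel('de')
asr = AIModels.NeuralASR(model, decoder)

audio = torch.zeros(1, 16000)   # one second of 16 kHz "silence", purely illustrative
asr.processAudio(audio)         # must be called before the getters
print(asr.getTranscript())
print(asr.getWordLocations())
```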
ModelInterfaces.py ADDED
@@ -0,0 +1,65 @@
+
+ import abc
+ import numpy as np
+
+
+ class IASRModel(metaclass=abc.ABCMeta):
+     @classmethod
+     def __subclasshook__(cls, subclass):
+         return (hasattr(subclass, 'getTranscript') and
+                 callable(subclass.getTranscript) and
+                 hasattr(subclass, 'getWordLocations') and
+                 callable(subclass.getWordLocations) and
+                 hasattr(subclass, 'processAudio') and
+                 callable(subclass.processAudio))
+
+     @abc.abstractmethod
+     def getTranscript(self) -> str:
+         """Get the transcript of the processed audio"""
+         raise NotImplementedError
+
+     @abc.abstractmethod
+     def getWordLocations(self) -> list:
+         """Get the word locations (in samples) from the audio"""
+         raise NotImplementedError
+
+     @abc.abstractmethod
+     def processAudio(self, audio):
+         """Process the audio"""
+         raise NotImplementedError
+
+
+ class ITranslationModel(metaclass=abc.ABCMeta):
+     @classmethod
+     def __subclasshook__(cls, subclass):
+         return (hasattr(subclass, 'translateSentence') and
+                 callable(subclass.translateSentence))
+
+     @abc.abstractmethod
+     def translateSentence(self, sentence: str) -> str:
+         """Get the translation of the sentence"""
+         raise NotImplementedError
+
+
+ class ITextToSpeechModel(metaclass=abc.ABCMeta):
+     @classmethod
+     def __subclasshook__(cls, subclass):
+         return (hasattr(subclass, 'getAudioFromSentence') and
+                 callable(subclass.getAudioFromSentence))
+
+     @abc.abstractmethod
+     def getAudioFromSentence(self, sentence: str) -> np.array:
+         """Get audio from sentence"""
+         raise NotImplementedError
+
+
+ class ITextToPhonemModel(metaclass=abc.ABCMeta):
+     @classmethod
+     def __subclasshook__(cls, subclass):
+         return (hasattr(subclass, 'convertToPhonem') and
+                 callable(subclass.convertToPhonem))
+
+     @abc.abstractmethod
+     def convertToPhonem(self, sentence: str) -> str:
+         """Convert sentence to phonemes"""
+         raise NotImplementedError
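A small, hypothetical illustration (not part of the commit) of why these interfaces define `__subclasshook__`: any class that simply provides the required methods is accepted as a virtual subclass, without having to inherit from the ABC:

```python
import ModelInterfaces


class DummyASR:
    """Not derived from IASRModel, but it exposes the three required methods."""

    def getTranscript(self) -> str:
        return 'hallo welt'

    def getWordLocations(self) -> list:
        return [{'start_ts': 0, 'end_ts': 16000}]

    def processAudio(self, audio):
        pass


print(isinstance(DummyASR(), ModelInterfaces.IASRModel))  # True, thanks to __subclasshook__
```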
README.md CHANGED
@@ -1 +1,24 @@
- # ai-pronunciation-trainer
+ ## Motivation
+ This tool uses AI to evaluate your pronunciation so you can be understood more clearly. To try the tool directly, go to https://aipronunciationtr.com (please use the Chrome browser on desktop).
+
+ Often, when we want to improve our pronunciation, it is very difficult to self-assess how well we are speaking. Asking a native speaker or a language instructor to constantly correct us is either impractical, due to monetary constraints, or annoying, simply because it becomes too tedious for the other person. Additionally, they may often say "it sounds okay" after your 10th try just to avoid discouraging you, even though your pronunciation may still contain mistakes.
+
+ The AI Pronunciation Trainer provides objective feedback on how good your pronunciation is in an automatic and scalable fashion, so the only limit to your improvement is your own dedication.
+
+ This project originated from a small program I wrote to improve my own pronunciation. When I finished it, I believed it could also be useful to other people trying to be better understood, so I decided to make a simple, more user-friendly version of it.
+
+ ## Installation
+ To run the program, all you need to do is install the requirements and run the main Python file:
+ ```
+ pip install -r requirements.txt
+ python webApp.py
+ ```
+ The code is pure Python, so you should be able to run it without any major issues as long as you're using a recent Python 3.x version.
+
+ ## Online version
+ For people who don't feel comfortable running code, or who just want a quick way to use the tool, I host an online version at https://aipronunciationtr.com. It should work without any major issues in desktop Chrome; other browsers are not officially supported, although most of the functionality should work fine.
+
+ Please be aware that usage is limited per day (I'm still not rich ;) ). If, for some reason, you would like to skip the daily usage limit, just get in touch and we can negotiate an additional API key only for you.
+
+ ## Disclaimer
+ Even though the tool can be useful, my intention was never to build an industry-grade program with 100% test coverage, no errors and full support, but rather an easy-to-use tool that may help some people even though it is not perfect. Thus, be aware that some small bugs may be present. In case you find something not working, all feedback is welcome and issues may be addressed depending on their severity.
RuleBasedModels.py ADDED
@@ -0,0 +1,29 @@
+ import ModelInterfaces
+ import torch
+ import numpy as np
+ import epitran
+ import eng_to_ipa
+
+
+ class EpitranPhonemConverter(ModelInterfaces.ITextToPhonemModel):
+     word_locations_in_samples = None
+     audio_transcript = None
+
+     def __init__(self, epitran_model) -> None:
+         super().__init__()
+         self.epitran_model = epitran_model
+
+     def convertToPhonem(self, sentence: str) -> str:
+         phonem_representation = self.epitran_model.transliterate(sentence)
+         return phonem_representation
+
+
+ class EngPhonemConverter(ModelInterfaces.ITextToPhonemModel):
+
+     def __init__(self) -> None:
+         super().__init__()
+
+     def convertToPhonem(self, sentence: str) -> str:
+         phonem_representation = eng_to_ipa.convert(sentence)
+         phonem_representation = phonem_representation.replace('*', '')
+         return phonem_representation
WordMatching.py ADDED
@@ -0,0 +1,181 @@
+ import WordMetrics
+ from ortools.sat.python import cp_model
+ import numpy as np
+ from string import punctuation
+ from dtwalign import dtw_from_distance_matrix
+ import time
+
+ offset_blank = 1
+ TIME_THRESHOLD_MAPPING = 5.0
+
+
+ def get_word_distance_matrix(words_estimated: list, words_real: list) -> np.array:
+     number_of_real_words = len(words_real)
+     number_of_estimated_words = len(words_estimated)
+
+     word_distance_matrix = np.zeros(
+         (number_of_estimated_words+offset_blank, number_of_real_words))
+     for idx_estimated in range(number_of_estimated_words):
+         for idx_real in range(number_of_real_words):
+             word_distance_matrix[idx_estimated, idx_real] = WordMetrics.edit_distance_python(
+                 words_estimated[idx_estimated], words_real[idx_real])
+
+     if offset_blank == 1:
+         for idx_real in range(number_of_real_words):
+             word_distance_matrix[number_of_estimated_words,
+                                  idx_real] = len(words_real[idx_real])
+     return word_distance_matrix
+
+
+ def get_best_path_from_distance_matrix(word_distance_matrix):
+     modelCpp = cp_model.CpModel()
+
+     number_of_real_words = word_distance_matrix.shape[1]
+     number_of_estimated_words = word_distance_matrix.shape[0]-1
+
+     number_words = np.maximum(number_of_real_words, number_of_estimated_words)
+
+     estimated_words_order = [modelCpp.NewIntVar(0, int(
+         number_words - 1 + offset_blank), 'w%i' % i) for i in range(number_words+offset_blank)]
+
+     # The mapped indices must be in ascending order
+     for word_idx in range(number_words-1):
+         modelCpp.Add(
+             estimated_words_order[word_idx+1] >= estimated_words_order[word_idx])
+
+     total_phoneme_distance = 0
+     real_word_at_time = {}
+     for idx_estimated in range(number_of_estimated_words):
+         for idx_real in range(number_of_real_words):
+             real_word_at_time[idx_estimated, idx_real] = modelCpp.NewBoolVar(
+                 'real_word_at_time'+str(idx_real)+'-'+str(idx_estimated))
+             modelCpp.Add(estimated_words_order[idx_estimated] == idx_real).OnlyEnforceIf(
+                 real_word_at_time[idx_estimated, idx_real])
+             total_phoneme_distance += word_distance_matrix[idx_estimated,
+                                                            idx_real]*real_word_at_time[idx_estimated, idx_real]
+
+     # If a real word has no match, its difference is calculated from the empty string
+     for idx_real in range(number_of_real_words):
+         word_has_a_match = modelCpp.NewBoolVar(
+             'word_has_a_match'+str(idx_real))
+         modelCpp.Add(sum([real_word_at_time[idx_estimated, idx_real] for idx_estimated in range(
+             number_of_estimated_words)]) == 1).OnlyEnforceIf(word_has_a_match)
+         total_phoneme_distance += word_distance_matrix[number_of_estimated_words,
+                                                        idx_real]*word_has_a_match.Not()
+
+     # Loss should be minimized
+     modelCpp.Minimize(total_phoneme_distance)
+
+     solver = cp_model.CpSolver()
+     solver.parameters.max_time_in_seconds = TIME_THRESHOLD_MAPPING
+     status = solver.Solve(modelCpp)
+
+     mapped_indices = []
+     try:
+         for word_idx in range(number_words):
+             mapped_indices.append(
+                 (solver.Value(estimated_words_order[word_idx])))
+
+         return np.array(mapped_indices, dtype=int)
+     except Exception:
+         return []
+
+
+ def get_resulting_string(mapped_indices: np.array, words_estimated: list, words_real: list) -> list:
+     mapped_words = []
+     mapped_words_indices = []
+     WORD_NOT_FOUND_TOKEN = '-'
+     number_of_real_words = len(words_real)
+     for word_idx in range(number_of_real_words):
+         position_of_real_word_indices = np.where(
+             mapped_indices == word_idx)[0].astype(int)
+
+         if len(position_of_real_word_indices) == 0:
+             mapped_words.append(WORD_NOT_FOUND_TOKEN)
+             mapped_words_indices.append(-1)
+             continue
+
+         if len(position_of_real_word_indices) == 1:
+             mapped_words.append(
+                 words_estimated[position_of_real_word_indices[0]])
+             mapped_words_indices.append(position_of_real_word_indices[0])
+             continue
+
+         # Check which index gives the lowest error
+         if len(position_of_real_word_indices) > 1:
+             error = 99999
+             best_possible_combination = ''
+             best_possible_idx = -1
+             for single_word_idx in position_of_real_word_indices:
+                 idx_above_word = single_word_idx >= len(words_estimated)
+                 if idx_above_word:
+                     continue
+                 error_word = WordMetrics.edit_distance_python(
+                     words_estimated[single_word_idx], words_real[word_idx])
+                 if error_word < error:
+                     error = error_word*1
+                     best_possible_combination = words_estimated[single_word_idx]
+                     best_possible_idx = single_word_idx
+
+             mapped_words.append(best_possible_combination)
+             mapped_words_indices.append(best_possible_idx)
+             continue
+
+     return mapped_words, mapped_words_indices
+
+
+ def get_best_mapped_words(words_estimated: list, words_real: list) -> list:
+
+     word_distance_matrix = get_word_distance_matrix(
+         words_estimated, words_real)
+
+     start = time.time()
+     mapped_indices = get_best_path_from_distance_matrix(word_distance_matrix)
+
+     duration_of_mapping = time.time()-start
+     # In case or-tools doesn't converge, fall back to a faster, lower-quality solution
+     if len(mapped_indices) == 0 or duration_of_mapping > TIME_THRESHOLD_MAPPING+0.5:
+         mapped_indices = (dtw_from_distance_matrix(
+             word_distance_matrix)).path[:len(words_estimated), 1]
+
+     mapped_words, mapped_words_indices = get_resulting_string(
+         mapped_indices, words_estimated, words_real)
+
+     return mapped_words, mapped_words_indices
+
+
+ # Faster, but not optimal
+ def get_best_mapped_words_dtw(words_estimated: list, words_real: list) -> list:
+
+     from dtwalign import dtw_from_distance_matrix
+     word_distance_matrix = get_word_distance_matrix(
+         words_estimated, words_real)
+     mapped_indices = dtw_from_distance_matrix(
+         word_distance_matrix).path[:-1, 0]
+
+     mapped_words, mapped_words_indices = get_resulting_string(
+         mapped_indices, words_estimated, words_real)
+     return mapped_words, mapped_words_indices
+
+
+ def getWhichLettersWereTranscribedCorrectly(real_word, transcribed_word):
+     is_letter_correct = [None]*len(real_word)
+     for idx, letter in enumerate(real_word):
+         if letter == transcribed_word[idx] or letter in punctuation:
+             is_letter_correct[idx] = 1
+         else:
+             is_letter_correct[idx] = 0
+     return is_letter_correct
+
+
+ def parseLetterErrorsToHTML(word_real, is_letter_correct):
+     word_colored = ''
+     correct_color_start = '*'
+     correct_color_end = '*'
+     wrong_color_start = '-'
+     wrong_color_end = '-'
+     for idx, letter in enumerate(word_real):
+         if is_letter_correct[idx] == 1:
+             word_colored += correct_color_start + letter + correct_color_end
+         else:
+             word_colored += wrong_color_start + letter + wrong_color_end
+     return word_colored
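For a feel of what the alignment produces, a small illustrative call (the comment shows the expected alignment; the exact result depends on the solver):

```python
import WordMatching as wm

words_estimated = ['the', 'kat', 'sat']         # what the ASR heard
words_real = ['the', 'cat', 'sat', 'down']      # what should have been said

mapped_words, mapped_indices = wm.get_best_mapped_words(words_estimated, words_real)
print(mapped_words)    # expected: ['the', 'kat', 'sat', '-'] -- unmatched reference words map to '-'
print(mapped_indices)  # index of the estimated word matched to each reference word
```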
WordMetrics.py ADDED
@@ -0,0 +1,55 @@
+ import numpy as np
+
+ # ref from https://gitlab.com/-/snippets/1948157
+ # For some variants, look here https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
+
+
+ # Pure python
+ def edit_distance_python2(a, b):
+     # This version is commutative, so as an optimization we force |a| >= |b|
+     if len(a) < len(b):
+         return edit_distance_python2(b, a)
+     if len(b) == 0:  # Can deal with empty sequences faster
+         return len(a)
+     # Only two rows are really needed: the one currently filled in, and the previous
+     distances = []
+     distances.append([i for i in range(len(b)+1)])
+     distances.append([0 for _ in range(len(b)+1)])
+     # We can prefill the first row:
+     costs = [0 for _ in range(3)]
+     for i, a_token in enumerate(a, start=1):
+         distances[1][0] += 1  # Deals with the first column.
+         for j, b_token in enumerate(b, start=1):
+             costs[0] = distances[1][j-1] + 1
+             costs[1] = distances[0][j] + 1
+             costs[2] = distances[0][j-1] + (0 if a_token == b_token else 1)
+             distances[1][j] = min(costs)
+         # Move to the next row:
+         distances[0][:] = distances[1][:]
+     return distances[1][len(b)]
+
+
+ # https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/
+ def edit_distance_python(seq1, seq2):
+     size_x = len(seq1) + 1
+     size_y = len(seq2) + 1
+     matrix = np.zeros((size_x, size_y))
+     for x in range(size_x):
+         matrix[x, 0] = x
+     for y in range(size_y):
+         matrix[0, y] = y
+
+     for x in range(1, size_x):
+         for y in range(1, size_y):
+             if seq1[x-1] == seq2[y-1]:
+                 matrix[x, y] = min(
+                     matrix[x-1, y] + 1,
+                     matrix[x-1, y-1],
+                     matrix[x, y-1] + 1
+                 )
+             else:
+                 matrix[x, y] = min(
+                     matrix[x-1, y] + 1,
+                     matrix[x-1, y-1] + 1,
+                     matrix[x, y-1] + 1
+                 )
+     return matrix[size_x - 1, size_y - 1]
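As a quick sanity check (illustrative only), the classic Levenshtein example: turning "kitten" into "sitting" takes three single-character edits:

```python
import WordMetrics

# substitution k->s, substitution e->i, insertion of g
print(WordMetrics.edit_distance_python('kitten', 'sitting'))   # 3.0 (returned as a NumPy float)
print(WordMetrics.edit_distance_python2('kitten', 'sitting'))  # 3
```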
data_de_en_2.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30ec872918777a5c766ab26d78f33383f6b36fccb00af7fc5c543bd43c98ffa4
+ size 1056086
lambdaGetSample.py ADDED
@@ -0,0 +1,89 @@
+
+ import pandas as pd
+ import json
+ import RuleBasedModels
+ import epitran
+ import random
+ import pickle
+
+
+ class TextDataset():
+     def __init__(self, table, language='-'):
+         self.table_dataframe = table
+         self.number_of_samples = len(table)
+         self.language = language
+
+     def __getitem__(self, idx):
+
+         if self.language == 'de':
+             line = [self.table_dataframe['de_sentence'].iloc[idx]]
+         elif self.language == 'en':
+             line = [self.table_dataframe['en_sentence'].iloc[idx]]
+         else:
+             line = [self.table_dataframe['sentence'].iloc[idx]]
+         return line
+
+     def __len__(self):
+         return self.number_of_samples
+
+
+ sample_folder = "./"
+ lambda_database = {}
+ lambda_ipa_converter = {}
+
+ with open(sample_folder+'data_de_en_2.pickle', 'rb') as handle:
+     df = pickle.load(handle)
+
+ lambda_database['de'] = TextDataset(df, 'de')
+ lambda_database['en'] = TextDataset(df, 'en')
+ lambda_translate_new_sample = False
+ lambda_ipa_converter['de'] = RuleBasedModels.EpitranPhonemConverter(
+     epitran.Epitran('deu-Latn'))
+ lambda_ipa_converter['en'] = RuleBasedModels.EngPhonemConverter()
+
+
+ def lambda_handler(event, context):
+
+     body = json.loads(event['body'])
+
+     category = int(body['category'])
+
+     language = body['language']
+
+     sample_in_category = False
+
+     while not sample_in_category:
+         valid_sequence = False
+         while not valid_sequence:
+             try:
+                 sample_idx = random.randint(
+                     0, len(lambda_database[language]) - 1)
+                 current_transcript = lambda_database[language][sample_idx]
+                 valid_sequence = True
+             except Exception:
+                 pass
+
+         sentence_category = getSentenceCategory(
+             current_transcript[0])
+
+         sample_in_category = (sentence_category ==
+                               category) or category == 0
+
+     translated_transcript = ""
+
+     current_ipa = lambda_ipa_converter[language].convertToPhonem(
+         current_transcript[0])
+
+     result = {'real_transcript': current_transcript,
+               'ipa_transcript': current_ipa,
+               'transcript_translation': translated_transcript}
+
+     return json.dumps(result)
+
+
+ def getSentenceCategory(sentence) -> int:
+     number_of_words = len(sentence.split())
+     categories_word_limits = [0, 8, 20, 100000]
+     for category in range(len(categories_word_limits)-1):
+         if number_of_words > categories_word_limits[category] and number_of_words <= categories_word_limits[category+1]:
+             return category+1
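For reference, the word-count buckets that `getSentenceCategory` produces (illustrative calls; note that importing `lambdaGetSample` also loads the sentence database and the epitran converters at module level):

```python
import lambdaGetSample

print(lambdaGetSample.getSentenceCategory('Das ist ein kurzer Satz'))  # 1: up to 8 words
print(lambdaGetSample.getSentenceCategory(' '.join(['Wort'] * 15)))    # 2: 9 to 20 words
print(lambdaGetSample.getSentenceCategory(' '.join(['Wort'] * 30)))    # 3: 21 words or more
```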
lambdaSpeechToScore.py ADDED
@@ -0,0 +1,196 @@
+
+ import torch
+ import json
+ import os
+ import WordMatching as wm
+ import utilsFileIO
+ import pronunciationTrainer
+ import base64
+ import time
+ import audioread
+ import numpy as np
+ from torchaudio.transforms import Resample
+
+
+ trainer_SST_lambda = {}
+ trainer_SST_lambda['de'] = pronunciationTrainer.getTrainer("de")
+ trainer_SST_lambda['en'] = pronunciationTrainer.getTrainer("en")
+
+ transform = Resample(orig_freq=48000, new_freq=16000)
+
+
+ def lambda_handler(event, context):
+
+     data = json.loads(event['body'])
+
+     real_text = data['title']
+     file_bytes = base64.b64decode(
+         data['base64Audio'][22:].encode('utf-8'))
+     language = data['language']
+
+     if len(real_text) == 0:
+         return {
+             'statusCode': 200,
+             'headers': {
+                 'Access-Control-Allow-Headers': '*',
+                 'Access-Control-Allow-Credentials': "true",
+                 'Access-Control-Allow-Origin': '*',
+                 'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
+             },
+             'body': ''
+         }
+
+     start = time.time()
+     random_file_name = './'+utilsFileIO.generateRandomString()+'.ogg'
+     with open(random_file_name, 'wb') as f:
+         f.write(file_bytes)
+     print('Time for saving binary in file: ', str(time.time()-start))
+
+     start = time.time()
+     signal, fs = audioread_load(random_file_name)
+
+     signal = transform(torch.Tensor(signal)).unsqueeze(0)
+
+     print('Time for loading .ogg file: ', str(time.time()-start))
+
+     result = trainer_SST_lambda[language].processAudioForGivenText(
+         signal, real_text)
+
+     start = time.time()
+     os.remove(random_file_name)
+     print('Time for deleting file: ', str(time.time()-start))
+
+     start = time.time()
+     real_transcripts_ipa = ' '.join(
+         [word[0] for word in result['real_and_transcribed_words_ipa']])
+     matched_transcripts_ipa = ' '.join(
+         [word[1] for word in result['real_and_transcribed_words_ipa']])
+
+     real_transcripts = ' '.join(
+         [word[0] for word in result['real_and_transcribed_words']])
+     matched_transcripts = ' '.join(
+         [word[1] for word in result['real_and_transcribed_words']])
+
+     words_real = real_transcripts.lower().split()
+     mapped_words = matched_transcripts.split()
+
+     is_letter_correct_all_words = ''
+     for idx, word_real in enumerate(words_real):
+
+         mapped_letters, mapped_letters_indices = wm.get_best_mapped_words(
+             mapped_words[idx], word_real)
+
+         is_letter_correct = wm.getWhichLettersWereTranscribedCorrectly(
+             word_real, mapped_letters)  # , mapped_letters_indices)
+
+         is_letter_correct_all_words += ''.join([str(is_correct)
+                                                 for is_correct in is_letter_correct]) + ' '
+
+     pair_accuracy_category = ' '.join(
+         [str(category) for category in result['pronunciation_categories']])
+     print('Time to post-process results: ', str(time.time()-start))
+
+     res = {'real_transcript': result['recording_transcript'],
+            'ipa_transcript': result['recording_ipa'],
+            'pronunciation_accuracy': str(int(result['pronunciation_accuracy'])),
+            'real_transcripts': real_transcripts, 'matched_transcripts': matched_transcripts,
+            'real_transcripts_ipa': real_transcripts_ipa, 'matched_transcripts_ipa': matched_transcripts_ipa,
+            'pair_accuracy_category': pair_accuracy_category,
+            'start_time': result['start_time'],
+            'end_time': result['end_time'],
+            'is_letter_correct_all_words': is_letter_correct_all_words}
+
+     return json.dumps(res)
+
+
+ # From Librosa
+ def audioread_load(path, offset=0.0, duration=None, dtype=np.float32):
+     """Load an audio buffer using audioread.
+
+     This loads one block at a time, and then concatenates the results.
+     """
+
+     y = []
+     with audioread.audio_open(path) as input_file:
+         sr_native = input_file.samplerate
+         n_channels = input_file.channels
+
+         s_start = int(np.round(sr_native * offset)) * n_channels
+
+         if duration is None:
+             s_end = np.inf
+         else:
+             s_end = s_start + \
+                 (int(np.round(sr_native * duration)) * n_channels)
+
+         n = 0
+
+         for frame in input_file:
+             frame = buf_to_float(frame, dtype=dtype)
+             n_prev = n
+             n = n + len(frame)
+
+             if n < s_start:
+                 # offset is after the current frame
+                 # keep reading
+                 continue
+
+             if s_end < n_prev:
+                 # we're off the end. stop reading
+                 break
+
+             if s_end < n:
+                 # the end is in this frame. crop.
+                 frame = frame[: s_end - n_prev]
+
+             if n_prev <= s_start <= n:
+                 # beginning is in this frame
+                 frame = frame[(s_start - n_prev):]
+
+             # tack on the current frame
+             y.append(frame)
+
+     if y:
+         y = np.concatenate(y)
+         if n_channels > 1:
+             y = y.reshape((-1, n_channels)).T
+     else:
+         y = np.empty(0, dtype=dtype)
+
+     return y, sr_native
+
+
+ # From Librosa
+ def buf_to_float(x, n_bytes=2, dtype=np.float32):
+     """Convert an integer buffer to floating point values.
+     This is primarily useful when loading integer-valued wav data
+     into numpy arrays.
+
+     Parameters
+     ----------
+     x : np.ndarray [dtype=int]
+         The integer-valued data buffer
+
+     n_bytes : int [1, 2, 4]
+         The number of bytes per sample in ``x``
+
+     dtype : numeric type
+         The target output type (default: 32-bit float)
+
+     Returns
+     -------
+     x_float : np.ndarray [dtype=float]
+         The input data buffer cast to floating point
+     """
+
+     # Invert the scale of the data
+     scale = 1.0 / float(1 << ((8 * n_bytes) - 1))
+
+     # Construct the format string
+     fmt = "<i{:d}".format(n_bytes)
+
+     # Rescale and format the data buffer
+     return scale * np.frombuffer(x, fmt).astype(dtype)
lambdaTTS.py ADDED
@@ -0,0 +1,46 @@
+
+ import models
+ import soundfile as sf
+ import json
+ import AIModels
+ # from flask import Response
+ import utilsFileIO
+ import os
+ import base64
+
+ sampling_rate = 16000
+ model_TTS_lambda = AIModels.NeuralTTS(models.getTTSModel('de'), sampling_rate)
+
+
+ def lambda_handler(event, context):
+
+     body = json.loads(event['body'])
+
+     text_string = body['value']
+
+     linear_factor = 0.2
+     audio = model_TTS_lambda.getAudioFromSentence(
+         text_string).detach().numpy()*linear_factor
+     random_file_name = utilsFileIO.generateRandomString(20)+'.wav'
+
+     sf.write('./'+random_file_name, audio, sampling_rate)
+
+     with open(random_file_name, "rb") as f:
+         audio_byte_array = f.read()
+
+     os.remove(random_file_name)
+
+     return {
+         'statusCode': 200,
+         'headers': {
+             'Access-Control-Allow-Headers': '*',
+             'Access-Control-Allow-Origin': '*',
+             'Access-Control-Allow-Methods': 'OPTIONS,POST,GET'
+         },
+         'body': json.dumps(
+             {
+                 "wavBase64": base64.b64encode(audio_byte_array).decode('utf-8'),
+             },
+         )
+     }
models.py ADDED
@@ -0,0 +1,71 @@
+ import torch
+ import torch.nn as nn
+
+ import pickle
+
+
+ def getASRModel(language: str) -> nn.Module:
+
+     if language == 'de':
+
+         model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
+                                                model='silero_stt',
+                                                language='de',
+                                                device=torch.device('cpu'))
+
+     elif language == 'en':
+         model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
+                                                model='silero_stt',
+                                                language='en',
+                                                device=torch.device('cpu'))
+     elif language == 'fr':
+         model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
+                                                model='silero_stt',
+                                                language='fr',
+                                                device=torch.device('cpu'))
+     else:
+         raise ValueError('Language not implemented')
+
+     return (model, decoder)
+
+
+ def getTTSModel(language: str) -> nn.Module:
+
+     if language == 'de':
+
+         speaker = 'thorsten_v2'  # 16 kHz
+         model, _ = torch.hub.load(repo_or_dir='snakers4/silero-models',
+                                   model='silero_tts',
+                                   language=language,
+                                   speaker=speaker)
+
+     elif language == 'en':
+         speaker = 'lj_16khz'  # 16 kHz
+         model = torch.hub.load(repo_or_dir='snakers4/silero-models',
+                                model='silero_tts',
+                                language=language,
+                                speaker=speaker)
+     else:
+         raise ValueError('Language not implemented')
+
+     return model
+
+
+ def getTranslationModel(language: str) -> nn.Module:
+     from transformers import AutoTokenizer
+     from transformers import AutoModelForSeq2SeqLM
+     if language == 'de':
+         model = AutoModelForSeq2SeqLM.from_pretrained(
+             "Helsinki-NLP/opus-mt-de-en")
+         tokenizer = AutoTokenizer.from_pretrained(
+             "Helsinki-NLP/opus-mt-de-en")
+         # Cache models to avoid Hugging Face processing
+         with open('translation_model_de.pickle', 'wb') as handle:
+             pickle.dump(model, handle)
+         with open('translation_tokenizer_de.pickle', 'wb') as handle:
+             pickle.dump(tokenizer, handle)
+     else:
+         raise ValueError('Language not implemented')
+
+     return model, tokenizer
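To illustrate how this pairs with `NeuralTranslator` from AIModels.py (illustrative only; downloading the Helsinki-NLP model requires an internet connection, and the call also writes the pickle caches shown above):

```python
import models
import AIModels

model, tokenizer = models.getTranslationModel('de')
translator = AIModels.NeuralTranslator(model, tokenizer)
print(translator.translateSentence('Guten Morgen'))  # e.g. "Good morning"
```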
pronunciationTrainer.py ADDED
@@ -0,0 +1,201 @@
+
+ import torch
+ import numpy as np
+ import models as mo
+ import WordMetrics
+ import WordMatching as wm
+ import epitran
+ import ModelInterfaces as mi
+ import AIModels
+ import RuleBasedModels
+ from string import punctuation
+ import time
+
+
+ def getTrainer(language: str):
+
+     device = torch.device('cpu')
+
+     model, decoder = mo.getASRModel(language)
+     model = model.to(device)
+     model.eval()
+     asr_model = AIModels.NeuralASR(model, decoder)
+
+     if language == 'de':
+         phonem_converter = RuleBasedModels.EpitranPhonemConverter(
+             epitran.Epitran('deu-Latn'))
+     elif language == 'en':
+         phonem_converter = RuleBasedModels.EngPhonemConverter()
+     else:
+         raise ValueError('Language not implemented')
+
+     trainer = PronunciationTrainer(
+         asr_model, phonem_converter)
+
+     return trainer
+
+
+ class PronunciationTrainer:
+     current_transcript: str
+     current_ipa: str
+
+     current_recorded_audio: torch.Tensor
+     current_recorded_transcript: str
+     current_recorded_word_locations: list
+     current_recorded_intonations: torch.tensor
+     current_words_pronunciation_accuracy = []
+     categories_thresholds = np.array([80, 60, 59])
+
+     sampling_rate = 16000
+
+     def __init__(self, asr_model: mi.IASRModel, word_to_ipa_converter: mi.ITextToPhonemModel) -> None:
+         self.asr_model = asr_model
+         self.ipa_converter = word_to_ipa_converter
+
+     def getTranscriptAndWordsLocations(self, audio_length_in_samples: int):
+
+         audio_transcript = self.asr_model.getTranscript()
+         word_locations_in_samples = self.asr_model.getWordLocations()
+
+         fade_duration_in_samples = 0.05*self.sampling_rate
+         word_locations_in_samples = [(int(np.maximum(0, word['start_ts']-fade_duration_in_samples)), int(np.minimum(
+             audio_length_in_samples-1, word['end_ts']+fade_duration_in_samples))) for word in word_locations_in_samples]
+
+         return audio_transcript, word_locations_in_samples
+
+     def getWordsRelativeIntonation(self, Audio: torch.tensor, word_locations: list):
+         intonations = torch.zeros((len(word_locations), 1))
+         intonation_fade_samples = 0.3*self.sampling_rate
+         print(intonations.shape)
+         for word in range(len(word_locations)):
+             intonation_start = int(np.maximum(
+                 0, word_locations[word][0]-intonation_fade_samples))
+             intonation_end = int(np.minimum(
+                 Audio.shape[1]-1, word_locations[word][1]+intonation_fade_samples))
+             intonations[word] = torch.sqrt(torch.mean(
+                 Audio[0][intonation_start:intonation_end]**2))
+
+         intonations = intonations/torch.mean(intonations)
+         return intonations
+
+     ##################### ASR Functions ###########################
+
+     def processAudioForGivenText(self, recordedAudio: torch.Tensor = None, real_text=None):
+
+         start = time.time()
+         recording_transcript, recording_ipa, word_locations = self.getAudioTranscript(
+             recordedAudio)
+         print('Time for NN to transcribe audio: ', str(time.time()-start))
+
+         start = time.time()
+         real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices = self.matchSampleAndRecordedWords(
+             real_text, recording_transcript)
+         print('Time for matching transcripts: ', str(time.time()-start))
+
+         start_time, end_time = self.getWordLocationsFromRecordInSeconds(
+             word_locations, mapped_words_indices)
+
+         pronunciation_accuracy, current_words_pronunciation_accuracy = self.getPronunciationAccuracy(
+             real_and_transcribed_words)  # _ipa
+
+         pronunciation_categories = self.getWordsPronunciationCategory(
+             current_words_pronunciation_accuracy)
+
+         result = {'recording_transcript': recording_transcript,
+                   'real_and_transcribed_words': real_and_transcribed_words,
+                   'recording_ipa': recording_ipa, 'start_time': start_time, 'end_time': end_time,
+                   'real_and_transcribed_words_ipa': real_and_transcribed_words_ipa, 'pronunciation_accuracy': pronunciation_accuracy,
+                   'pronunciation_categories': pronunciation_categories}
+
+         return result
+
+     def getAudioTranscript(self, recordedAudio: torch.Tensor = None):
+         current_recorded_audio = recordedAudio
+
+         current_recorded_audio = self.preprocessAudio(
+             current_recorded_audio)
+
+         self.asr_model.processAudio(current_recorded_audio)
+
+         current_recorded_transcript, current_recorded_word_locations = self.getTranscriptAndWordsLocations(
+             current_recorded_audio.shape[1])
+         current_recorded_ipa = self.ipa_converter.convertToPhonem(
+             current_recorded_transcript)
+
+         return current_recorded_transcript, current_recorded_ipa, current_recorded_word_locations
+
+     def getWordLocationsFromRecordInSeconds(self, word_locations, mapped_words_indices) -> list:
+         start_time = []
+         end_time = []
+         for word_idx in range(len(mapped_words_indices)):
+             start_time.append(float(word_locations[mapped_words_indices[word_idx]]
+                                     [0])/self.sampling_rate)
+             end_time.append(float(word_locations[mapped_words_indices[word_idx]]
+                                   [1])/self.sampling_rate)
+         return ' '.join([str(time) for time in start_time]), ' '.join([str(time) for time in end_time])
+
+     ##################### END ASR Functions ###########################
+
+     ##################### Evaluation Functions ###########################
+     def matchSampleAndRecordedWords(self, real_text, recorded_transcript):
+         words_estimated = recorded_transcript.split()
+
+         if real_text is None:
+             words_real = self.current_transcript[0].split()
+         else:
+             words_real = real_text.split()
+
+         mapped_words, mapped_words_indices = wm.get_best_mapped_words(
+             words_estimated, words_real)
+
+         real_and_transcribed_words = []
+         real_and_transcribed_words_ipa = []
+         for word_idx in range(len(words_real)):
+             if word_idx >= len(mapped_words)-1:
+                 mapped_words.append('-')
+             real_and_transcribed_words.append(
+                 (words_real[word_idx], mapped_words[word_idx]))
+             real_and_transcribed_words_ipa.append((self.ipa_converter.convertToPhonem(words_real[word_idx]),
+                                                    self.ipa_converter.convertToPhonem(mapped_words[word_idx])))
+         return real_and_transcribed_words, real_and_transcribed_words_ipa, mapped_words_indices
+
+     def getPronunciationAccuracy(self, real_and_transcribed_words_ipa) -> float:
+         total_mismatches = 0.
+         number_of_phonemes = 0.
+         current_words_pronunciation_accuracy = []
+         for pair in real_and_transcribed_words_ipa:
+
+             real_without_punctuation = self.removePunctuation(pair[0]).lower()
+             number_of_word_mismatches = WordMetrics.edit_distance_python(
+                 real_without_punctuation, self.removePunctuation(pair[1]).lower())
+             total_mismatches += number_of_word_mismatches
+             number_of_phonemes_in_word = len(real_without_punctuation)
+             number_of_phonemes += number_of_phonemes_in_word
+
+             current_words_pronunciation_accuracy.append(float(
+                 number_of_phonemes_in_word-number_of_word_mismatches)/number_of_phonemes_in_word*100)
+
+         percentage_of_correct_pronunciations = (
+             number_of_phonemes-total_mismatches)/number_of_phonemes*100
+
+         return np.round(percentage_of_correct_pronunciations), current_words_pronunciation_accuracy
+
+     def removePunctuation(self, word: str) -> str:
+         return ''.join([char for char in word if char not in punctuation])
+
+     def getWordsPronunciationCategory(self, accuracies) -> list:
+         categories = []
+
+         for accuracy in accuracies:
+             categories.append(
+                 self.getPronunciationCategoryFromAccuracy(accuracy))
+
+         return categories
+
+     def getPronunciationCategoryFromAccuracy(self, accuracy) -> int:
+         return np.argmin(abs(self.categories_thresholds-accuracy))
+
+     def preprocessAudio(self, audio: torch.tensor) -> torch.tensor:
+         audio = audio - torch.mean(audio)
+         audio = audio / torch.max(torch.abs(audio))
+         return audio
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ -f https://download.pytorch.org/whl/torch_stable.html
+ torch==1.10.1
+ torchaudio==0.10.1
+ soundfile==0.10.3.post1
+ omegaconf
+ epitran==1.15
+ audioread
+ requests
+ dtwalign
+ eng_to_ipa
+ pandas
+ flask
+ flask_cors
+ pickle-mixin
+ sqlalchemy
+ transformers
+ sentencepiece
+ ortools==9.2.9972
static/.DS_Store ADDED
Binary file (8.2 kB).
 
static/ASR_bad.wav ADDED
Binary file (425 kB).
 
static/ASR_good.wav ADDED
Binary file (425 kB).
 
static/ASR_okay.wav ADDED
Binary file (425 kB).
 
static/css/.DS_Store ADDED
Binary file (6.15 kB).
 
static/css/style-new.css ADDED
@@ -0,0 +1,471 @@
1
+ body {
2
+ background: #f2f2f2;
3
+ }
4
+
5
+
6
+ .expanded {
7
+ margin: auto;
8
+ align-content: center;
9
+ }
10
+
11
+ p {
12
+ overflow: auto;
13
+ }
14
+
15
+ h1 {
16
+ margin-left: 2%;
17
+ }
18
+
19
+ a.disabled {
20
+ pointer-events: none;
21
+ color: #ccc;
22
+ background-color: #ccc;
23
+ }
24
+
25
+
26
+ .horizontal-flexbox {
27
+ height: 100%;
28
+ width: 100%;
29
+ display: flex;
30
+ }
31
+
32
+ /* ############## Next button ##### */
33
+ .button-next {
34
+ border-radius: 4px;
35
+ display: block;
36
+ border: none;
37
+ color: #FFFFFF;
38
+ text-align: left;
39
+ font-size: 3em;
40
+ box-sizing: border-box;
41
+ position: absolute;
42
+ top: 0;
43
+ left: 0%;
44
+ right: 2%;
45
+ bottom: 2%;
46
+ background-color: #58636d;
47
+ width: 10em;
48
+
49
+ transition: all 0.5s;
50
+ cursor: pointer;
51
+ }
52
+
53
+ .button-next:hover {
54
+ background-color: #6383a1 !important;
55
+ }
56
+
57
+ .button-next span {
58
+ cursor: pointer;
59
+ display: inline-block;
60
+ position: relative;
61
+ transition: 0.5s;
62
+ }
63
+
64
+ /*
65
+ .button-next span:after {
66
+ content: '\00bb';
67
+ position: absolute;
68
+ opacity: 0;
69
+ top: 0;
70
+ right: -20px;
71
+ transition: 0.5s;
72
+
73
+ }*/
74
+
75
+ .button-next:hover span {
76
+ padding-right: 25px;
77
+ }
78
+
79
+ .button-next:hover span:after {
80
+ opacity: 1;
81
+ right: 0;
82
+ }
83
+
84
+
85
+
86
+ /* ############# Texts ############## */
87
+
88
+ .main-text {
89
+ font-size: 2.5em;
90
+ max-width: 87%;
91
+ }
92
+
93
+ .ipa-text {
94
+ font-size: 1.8em;
95
+ max-width: 87%;
96
+ }
97
+
98
+ .ipa-text-small {
99
+ font-size: 1.5em;
100
+ }
101
+
102
+ .accuracy-text {
103
+ /*font-family: "Dank Mono", ui-monospace, monospace;*/
104
+ background: linear-gradient(to right,
105
+ rgb(54, 56, 80),
106
+ rgb(21, 60, 87));
107
+ background-clip: text;
108
+ -webkit-background-clip: text;
109
+ -webkit-text-fill-color: transparent;
110
+ text-align: center;
111
+ font-size: 2em;
112
+ margin-left: 2%;
113
+ left: 0%;
114
+ }
115
+
116
+ .main-text-div {
117
+ overflow-y: auto;
118
+ position: absolute;
119
+ left: 10%;
120
+ right: 10%;
121
+ top: 2%;
122
+ bottom: 2%;
123
+ }
124
+
125
+ /* ############# Card Container ############## */
126
+ .container {
127
+ display: block;
128
+ position: absolute;
129
+ left: 2%;
130
+ top: 18%;
131
+ transform: translate(-0%, -0%);
132
+ height: 59%;
133
+ width: 96%;
134
+ max-width: 96%;
135
+ background: #ffff;
136
+ overflow: hidden;
137
+ border-radius: 20px;
138
+ box-shadow: 0 0 20px 8px #d0d0d0;
139
+ }
140
+
141
+ .container-small {
142
+ position: fixed;
143
+ left: 68%;
144
+ top: 79%;
145
+ transform: translate(-0%, -0%);
146
+ height: 7%;
147
+ width: 30%;
148
+ background: #ffff;
149
+ overflow: hidden;
150
+ border-radius: 20px;
151
+ box-shadow: 0 0 20px 8px #d0d0d0;
152
+ }
153
+
154
+ /* ############# Icon Button ############## */
155
+
156
+ .round-button {
157
+ box-sizing: border-box;
158
+ display: block;
159
+ width: 3em;
160
+ /* 80px */
161
+ height: 3em;
162
+ left: 0%;
163
+ padding-top: 14px;
164
+ padding-left: 0px;
165
+ line-height: 0px;
166
+ border: 6px solid #fff;
167
+ border-radius: 50%;
168
+ color: #f5f5f5;
169
+ text-align: center;
170
+ text-decoration: none;
171
+ background-color: #467387;
172
+ font-size: 20px;
173
+ font-weight: bold;
174
+ transition: all 0.3s ease;
175
+ }
176
+
177
+ .round-button:hover {
178
+ background-color: rgba(0, 0, 0, 0.8);
179
+ box-shadow: 0px 0px 10px #61a4d4;
180
+ text-shadow: 0px 0px 10px #61a4d4;
181
+ }
182
+
183
+ .icon-text {
184
+ font-size: 1em !important;
185
+ text-align: center;
186
+ }
187
+
188
+ .round-button-mic {
189
+ box-sizing: border-box;
190
+ display: block;
191
+ width: 4.5em;
192
+ /* 80px */
193
+ height: 4.5em;
194
+ padding-top: 14px;
195
+ padding-left: -2.25em;
196
+ line-height: 0px;
197
+ border: 6px solid #fff;
198
+ border-radius: 50%;
199
+ color: #f5f5f5;
200
+ text-align: center;
201
+ text-decoration: none;
202
+ background-color: #49d67d;
203
+ /*#467387;*/
204
+ font-size: 20px;
205
+ font-weight: bold;
206
+ transition: all 0.3s ease;
207
+ }
208
+
209
+ .round-button-mic:hover {
210
+ background-color: #477c5b;
211
+ /*rgba(0,0,0,0.8);*/
212
+ box-shadow: 0px 0px 10px #61a4d4;
213
+ text-shadow: 0px 0px 10px #61a4d4;
214
+ }
215
+
216
+ .icon-text-mic {
217
+ font-size: 2.5em !important;
218
+ }
219
+
220
+ .icon-text-home {
221
+ font-size: 3.5em !important;
222
+ }
223
+
224
+ .mic-button-div {
225
+ position: fixed;
226
+ left: 50%;
227
+ top: 80%
228
+ }
229
+
230
+ /*############### Drop-down ############# */
231
+ .dropbtn {
232
+ background-color: #ffffff;
233
+ color: rgb(50, 71, 165);
234
+ padding: 0px;
235
+ font-size: 16px;
236
+ border: none;
237
+ }
238
+
239
+ .dropdown {
240
+ position: relative;
241
+ display: inline-block;
242
+ }
243
+
244
+ .dropdown-content {
245
+ display: none;
246
+ position: absolute;
247
+ background-color: #ffffff;
248
+ min-width: 160px;
249
+ box-shadow: 0px 8px 16px 0px rgba(0, 0, 0, 0.2);
250
+ z-index: 1;
251
+ }
252
+
253
+ .dropdown-content a {
254
+ color: black;
255
+ padding: 12px 16px;
256
+ text-decoration: none;
257
+ display: block;
258
+ }
259
+
260
+ .dropdown-content a:hover {
261
+ background-color: #ddd;
262
+ }
263
+
264
+ .dropdown:hover .dropdown-content {
265
+ display: block;
266
+ }
267
+
268
+ .dropdown:hover .dropbtn {
269
+ background-color: #3e8e41;
270
+ }
271
+
272
+ /* ############# Arrow ############## position: relative; position: absolute;*/
273
+ .load-more {
274
+ position: fixed;
275
+ cursor: pointer;
276
+ width: 100px;
277
+ height: 100px;
278
+ margin: -0px 0 0 -0px;
279
+ min-width: 10px;
280
+ min-height: 10px;
281
+
282
+ left: 90%;
283
+ top: 45%;
284
+ border-width: 2px;
285
+ border-style: solid;
286
+ border-color: transparent;
287
+ border-bottom-color: #000;
288
+ border-right-color: #000;
289
+ border-radius: 0 0 5px 0;
290
+
291
+ transform: translate(-0%, -0%) rotate(-45deg);
292
+ }
293
+
294
+ /* ######## Radio Buttons ############## */
295
+ .radio {
296
+ background: #f6f7fd;
297
+ padding: 4px;
298
+ border-radius: 3px;
299
+ box-shadow: inset 0 0 0 3px rgba(35, 33, 45, 0.3),
300
+ 0 0 0 3px rgba(185, 185, 185, 0.3);
301
+ position: relative;
302
+ }
303
+
304
+ .radio input {
305
+ width: max-content;
306
+ height: 100%;
307
+ appearance: none;
308
+ outline: none;
309
+ cursor: pointer;
310
+ border-radius: 2px;
311
+ padding: 4px 8px;
312
+ background: #454857;
313
+ color: #bdbdbdbd;
314
+ font-size: 0.8em;
315
+ font-family: system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
316
+ "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji",
317
+ "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
318
+ transition: all 100ms linear;
319
+ }
320
+
321
+ .radio input:checked {
322
+ background-image: linear-gradient(180deg, #4e70ce, #5197d8);
323
+ color: #fff;
324
+ box-shadow: 0 1px 1px #0000002e;
325
+ text-shadow: 0 1px 0px #79485f7a;
326
+ }
327
+
328
+ .radio input:before {
329
+ content: attr(label);
330
+ display: inline-block;
331
+ text-align: center;
332
+ width: 100%;
333
+ }
334
+
335
+ /* ############ Links and credits ####*/
336
+
337
+ .link-icon-div {
338
+ position: fixed;
339
+ left: 90.0%;
340
+ top: 0.0%;
341
+ vertical-align: middle;
342
+ align-content: flex-start;
343
+ }
344
+
345
+ .credits-icon-div {
346
+ position: fixed;
347
+ left: 90.5%;
348
+ top: 95%;
349
+ font-size: x-small;
350
+ }
351
+
352
+ .svg-icon {
353
+ padding-top: 1em;
354
+ width: 50px;
355
+ height: 50px;
356
+
357
+ }
358
+
359
+
360
+ /* ######## Switch ############## */
361
+ @media only screen and (max-width: 1200px) {
362
+ .round-button {
363
+ box-sizing: border-box;
364
+ display: block;
365
+ width: 2em;
366
+ /* 80px */
367
+ height: 2em;
368
+ left: -2.5%;
369
+ padding-top: 0.3em;
370
+ padding-left: 0px;
371
+ line-height: 0px;
372
+ border: 6px solid #fff;
373
+ border-radius: 50%;
374
+ color: #f5f5f5;
375
+ text-align: center;
376
+ text-decoration: none;
377
+ background-color: #467387;
378
+ font-size: 1em;
379
+ font-weight: bold;
380
+ transition: all 0.3s ease;
381
+ }
382
+
383
+ .container {
384
+ display: block;
385
+ position: absolute;
386
+ left: 2%;
387
+ top: 22%;
388
+ transform: translate(-0%, -0%);
389
+ height: 55%;
390
+ width: 96%;
391
+ max-width: 96%;
392
+ background: #ffff;
393
+ overflow: hidden;
394
+ border-radius: 20px;
395
+ box-shadow: 0 0 20px 8px #d0d0d0;
396
+ }
397
+
398
+ .icon-text {
399
+ font-size: 0.8em !important;
400
+ text-align: center;
401
+ }
402
+
403
+ .ipa-text-small {
404
+ font-size: small;
405
+ }
406
+
407
+ .round-button-mic {
408
+ box-sizing: border-box;
409
+ display: block;
410
+ width: 3.5em;
411
+ /* 80px */
412
+ height: 3.5em;
413
+ padding-top: 0.4em;
414
+ left: 40%;
415
+ line-height: 0px;
416
+ border: 6px solid #fff;
417
+ border-radius: 50%;
418
+ color: #f5f5f5;
419
+ text-align: center;
420
+ text-decoration: none;
421
+ background-color: #49d67d;
422
+ font-size: 20px;
423
+ font-weight: bold;
424
+ transition: all 0.3s ease;
425
+ }
426
+
427
+ .mic-button-div {
428
+ position: fixed;
429
+ left: 40%;
430
+ top: 80%
431
+ }
432
+
433
+ .link-icon-div {
434
+ position: fixed;
435
+ left: 89.0%;
436
+ top: 0.0%;
437
+ vertical-align: middle;
438
+ }
439
+
440
+ .credits-icon-div {
441
+ position: fixed;
442
+ left: 78.5%;
443
+ top: 95%;
444
+ font-size: x-small;
445
+ }
446
+
447
+ .svg-icon {
448
+ padding-top: 1em;
449
+ width: 40px;
450
+ height: 40px;
451
+ }
452
+
453
+
454
+ .icon-text-home {
455
+ font-size: 2.5em !important;
456
+ }
457
+
458
+ .accuracy-text {
459
+ font-family: "Dank Mono", ui-monospace, monospace;
460
+ background: linear-gradient(to right,
461
+ rgb(54, 56, 80),
462
+ rgb(21, 60, 87));
463
+ left: -5.0%;
464
+ background-clip: text;
465
+ -webkit-background-clip: text;
466
+ -webkit-text-fill-color: transparent;
467
+ text-align: center;
468
+ font-size: 0.8em;
469
+ }
470
+
471
+ }
static/javascript/callbacks.js ADDED
@@ -0,0 +1,584 @@
1
+
2
+
3
+ // Audio context initialization
4
+ let mediaRecorder, audioChunks, audioBlob, stream, audioRecorded;
5
+ const ctx = new AudioContext();
6
+ let currentAudioForPlaying;
7
+ let lettersOfWordAreCorrect = [];
8
+
9
+ // UI-related variables
10
+ const page_title = "AI Pronunciation Trainer";
11
+ const accuracy_colors = ["green", "orange", "red"];
12
+ let badScoreThreshold = 30;
13
+ let mediumScoreThreshold = 70;
14
+ let currentSample = 0;
15
+ let currentScore = 0.;
16
+ let sample_difficult = 0;
17
+ let scoreMultiplier = 1;
18
+ let playAnswerSounds = true;
19
+ let isNativeSelectedForPlayback = true;
20
+ let isRecording = false;
21
+ let serverIsInitialized = false;
22
+ let serverWorking = true;
23
+ let languageFound = true;
24
+ let currentSoundRecorded = false;
25
+ let currentText, currentIpa, real_transcripts_ipa, matched_transcripts_ipa;
26
+ let wordCategories;
27
+ let startTime, endTime;
28
+
29
+ // API related variables
30
+ let AILanguage = "de"; // Standard is German
31
+
32
+
33
+ let STScoreAPIKey = 'rll5QsTiv83nti99BW6uCmvs9BDVxSB39SVFceYb'; // Public Key. If, for some reason, you would like a private one, send-me a message and we can discuss some possibilities
34
+ let apiMainPathSample = '';// 'http://127.0.0.1:3001';// 'https://a3hj0l2j2m.execute-api.eu-central-1.amazonaws.com/Prod';
35
+ let apiMainPathSTS = '';// 'https://wrg7ayuv7i.execute-api.eu-central-1.amazonaws.com/Prod';
36
+
37
+
38
+ // Variables to playback accuracy sounds
39
+ let soundsPath = '../static';//'https://stscore-sounds-bucket.s3.eu-central-1.amazonaws.com';
40
+ let soundFileGood = null;
41
+ let soundFileOkay = null;
42
+ let soundFileBad = null;
43
+
44
+ // Speech generation
45
+ var synth = window.speechSynthesis;
46
+ let voice_idx = 0;
47
+ let voice_synth = null;
48
+
49
+ //############################ UI general control functions ###################
50
+ const unblockUI = () => {
51
+ document.getElementById("recordAudio").classList.remove('disabled');
52
+ document.getElementById("playSampleAudio").classList.remove('disabled');
53
+ document.getElementById("buttonNext").onclick = () => getNextSample();
54
+ document.getElementById("nextButtonDiv").classList.remove('disabled');
55
+ document.getElementById("original_script").classList.remove('disabled');
56
+ document.getElementById("buttonNext").style["background-color"] = '#58636d';
57
+
58
+ if (currentSoundRecorded)
59
+ document.getElementById("playRecordedAudio").classList.remove('disabled');
60
+
61
+
62
+ };
63
+
64
+ const blockUI = () => {
65
+
66
+ document.getElementById("recordAudio").classList.add('disabled');
67
+ document.getElementById("playSampleAudio").classList.add('disabled');
68
+ document.getElementById("buttonNext").onclick = null;
69
+ document.getElementById("original_script").classList.add('disabled');
70
+ document.getElementById("playRecordedAudio").classList.add('disabled');
71
+
72
+ document.getElementById("buttonNext").style["background-color"] = '#adadad';
73
+
74
+
75
+ };
76
+
77
+ const UIError = () => {
78
+ blockUI();
79
+ document.getElementById("buttonNext").onclick = () => getNextSample(); //If error, user can only try to get a new sample
80
+ document.getElementById("buttonNext").style["background-color"] = '#58636d';
81
+
82
+ document.getElementById("recorded_ipa_script").innerHTML = "";
83
+ document.getElementById("single_word_ipa_pair").innerHTML = "Error";
84
+ document.getElementById("ipa_script").innerHTML = "Error"
85
+
86
+ document.getElementById("main_title").innerHTML = 'Server Error';
87
+ document.getElementById("original_script").innerHTML = 'Server error. Either the daily quota of the server is over or there was some internal error. You can try to generate a new sample in a few seconds. If the error persist, try comming back tomorrow or download the local version from Github :)';
88
+ };
89
+
90
+ const UINotSupported = () => {
91
+ unblockUI();
92
+
93
+ document.getElementById("main_title").innerHTML = "Browser unsupported";
94
+
95
+ }
96
+
97
+ const UIRecordingError = () => {
98
+ unblockUI();
99
+ document.getElementById("main_title").innerHTML = "Recording error, please try again or restart page.";
100
+ startMediaDevice();
101
+ }
102
+
103
+
104
+
105
+ //################### Application state functions #######################
106
+ function updateScore(currentPronunciationScore) {
107
+
108
+ if (isNaN(currentPronunciationScore))
109
+ return;
110
+ currentScore += currentPronunciationScore * scoreMultiplier;
111
+ currentScore = Math.round(currentScore);
112
+ }
113
+
114
+ const cacheSoundFiles = async () => {
115
+ await fetch(soundsPath + '/ASR_good.wav').then(data => data.arrayBuffer()).
116
+ then(arrayBuffer => ctx.decodeAudioData(arrayBuffer)).
117
+ then(decodeAudioData => {
118
+ soundFileGood = decodeAudioData;
119
+ });
120
+
121
+ await fetch(soundsPath + '/ASR_okay.wav').then(data => data.arrayBuffer()).
122
+ then(arrayBuffer => ctx.decodeAudioData(arrayBuffer)).
123
+ then(decodeAudioData => {
124
+ soundFileOkay = decodeAudioData;
125
+ });
126
+
127
+ await fetch(soundsPath + '/ASR_bad.wav').then(data => data.arrayBuffer()).
128
+ then(arrayBuffer => ctx.decodeAudioData(arrayBuffer)).
129
+ then(decodeAudioData => {
130
+ soundFileBad = decodeAudioData;
131
+ });
132
+ }
133
+
134
+ const getNextSample = async () => {
135
+
136
+
137
+
138
+ blockUI();
139
+
140
+ if (!serverIsInitialized)
141
+ await initializeServer();
142
+
143
+ if (!serverWorking) {
144
+ UIError();
145
+ return;
146
+ }
147
+
148
+ if (soundFileBad == null)
149
+ cacheSoundFiles();
150
+
151
+
152
+
153
+ updateScore(parseFloat(document.getElementById("pronunciation_accuracy").innerHTML));
154
+
155
+ document.getElementById("main_title").innerHTML = "Processing new sample...";
156
+
157
+
158
+ if (document.getElementById('lengthCat1').checked) {
159
+ sample_difficult = 0;
160
+ scoreMultiplier = 1.3;
161
+ }
162
+ else if (document.getElementById('lengthCat2').checked) {
163
+ sample_difficult = 1;
164
+ scoreMultiplier = 1;
165
+ }
166
+ else if (document.getElementById('lengthCat3').checked) {
167
+ sample_difficult = 2;
168
+ scoreMultiplier = 1.3;
169
+ }
170
+ else if (document.getElementById('lengthCat4').checked) {
171
+ sample_difficult = 3;
172
+ scoreMultiplier = 1.6;
173
+ }
174
+
175
+ try {
176
+ await fetch(apiMainPathSample + '/getSample', {
177
+ method: "post",
178
+ body: JSON.stringify({
179
+ "category": sample_difficult.toString(), "language": AILanguage
180
+ }),
181
+ headers: { "X-Api-Key": STScoreAPIKey }
182
+ }).then(res => res.json()).
183
+ then(data => {
184
+
185
+
186
+
187
+ let doc = document.getElementById("original_script");
188
+ currentText = data.real_transcript;
189
+ doc.innerHTML = currentText;
190
+
191
+ currentIpa = data.ipa_transcript
192
+
193
+ let doc_ipa = document.getElementById("ipa_script");
194
+ doc_ipa.innerHTML = "/ " + currentIpa + " /";
195
+
196
+ document.getElementById("recorded_ipa_script").innerHTML = ""
197
+ document.getElementById("pronunciation_accuracy").innerHTML = "";
198
+ document.getElementById("single_word_ipa_pair").innerHTML = "Reference | Spoken"
199
+ document.getElementById("section_accuracy").innerHTML = "| Score: " + currentScore.toString() + " - (" + currentSample.toString() + ")";
200
+ currentSample += 1;
201
+
202
+ document.getElementById("main_title").innerHTML = page_title;
203
+
204
+ document.getElementById("translated_script").innerHTML = data.transcript_translation;
205
+
206
+ currentSoundRecorded = false;
207
+ unblockUI();
208
+ document.getElementById("playRecordedAudio").classList.add('disabled');
209
+
210
+ })
211
+ }
212
+ catch
213
+ {
214
+ UIError();
215
+ }
216
+
217
+
218
+ };
219
+
220
+ const updateRecordingState = async () => {
221
+ if (isRecording) {
222
+ stopRecording();
223
+ return
224
+ }
225
+ else {
226
+ recordSample()
227
+ return;
228
+ }
229
+ }
230
+
231
+ const generateWordModal = (word_idx) => {
232
+
233
+ document.getElementById("single_word_ipa_pair").innerHTML = wrapWordForPlayingLink(real_transcripts_ipa[word_idx], word_idx, false, "black")
234
+ + ' | ' + wrapWordForPlayingLink(matched_transcripts_ipa[word_idx], word_idx, true, accuracy_colors[parseInt(wordCategories[word_idx])])
235
+ }
236
+
237
+ const recordSample = async () => {
238
+
239
+ document.getElementById("main_title").innerHTML = "Recording... click again when done speaking";
240
+ document.getElementById("recordIcon").innerHTML = 'pause_presentation';
241
+ blockUI();
242
+ document.getElementById("recordAudio").classList.remove('disabled');
243
+ audioChunks = [];
244
+ isRecording = true;
245
+ mediaRecorder.start();
246
+
247
+ }
248
+
249
+ const changeLanguage = (language, generateNewSample = false) => {
250
+ voices = synth.getVoices();
251
+ AILanguage = language;
252
+ languageFound = false;
253
+ let languageIdentifier, languageName;
254
+ switch (language) {
255
+ case 'de':
256
+
257
+ document.getElementById("languageBox").innerHTML = "German";
258
+ languageIdentifier = 'de';
259
+ languageName = 'Anna';
260
+ break;
261
+
262
+ case 'en':
263
+
264
+ document.getElementById("languageBox").innerHTML = "English";
265
+ languageIdentifier = 'en';
266
+ languageName = 'Daniel';
267
+ break;
268
+ };
269
+
270
+ for (idx = 0; idx < voices.length; idx++) {
271
+ if (voices[idx].lang.slice(0, 2) == languageIdentifier && voices[idx].name == languageName) {
272
+ voice_synth = voices[idx];
273
+ languageFound = true;
274
+ break;
275
+ }
276
+
277
+ }
278
+ // If the specific voice was not found, fall back to any voice with the same language
279
+ if (!languageFound) {
280
+ for (idx = 0; idx < voices.length; idx++) {
281
+ if (voices[idx].lang.slice(0, 2) == languageIdentifier) {
282
+ voice_synth = voices[idx];
283
+ languageFound = true;
284
+ break;
285
+ }
286
+ }
287
+ }
288
+ if (generateNewSample)
289
+ getNextSample();
290
+ }
291
+
292
+ //################### Speech-To-Score function ########################
293
+ const mediaStreamConstraints = {
294
+ audio: {
295
+ channelCount: 1,
296
+ sampleRate: 48000
297
+ }
298
+ }
299
+
300
+
301
+ const startMediaDevice = () => {
302
+ navigator.mediaDevices.getUserMedia(mediaStreamConstraints).then(_stream => {
303
+ stream = _stream
304
+ mediaRecorder = new MediaRecorder(stream);
305
+
306
+ let currentSamples = 0
307
+ mediaRecorder.ondataavailable = event => {
308
+
309
+ currentSamples += event.data.length
310
+ audioChunks.push(event.data);
311
+ };
312
+
313
+ mediaRecorder.onstop = async () => {
314
+
315
+
316
+ document.getElementById("recordIcon").innerHTML = 'mic';
317
+ blockUI();
318
+
319
+
320
+ audioBlob = new Blob(audioChunks, { type: 'audio/ogg;' });
321
+
322
+ let audioUrl = URL.createObjectURL(audioBlob);
323
+ audioRecorded = new Audio(audioUrl);
324
+
325
+ let audioBase64 = await convertBlobToBase64(audioBlob);
326
+
327
+ let minimumAllowedLength = 6;
328
+ if (audioBase64.length < minimumAllowedLength) {
329
+ setTimeout(UIRecordingError, 50); // Defer so this onstop handler finishes before the error UI reset runs
330
+ return;
331
+ }
332
+
333
+ try {
334
+ await fetch(apiMainPathSTS + '/GetAccuracyFromRecordedAudio', {
335
+ method: "post",
336
+ body: JSON.stringify({ "title": currentText[0], "base64Audio": audioBase64, "language": AILanguage }),
337
+ headers: { "X-Api-Key": STScoreAPIKey }
338
+
339
+ }).then(res => res.json()).
340
+ then(data => {
341
+
342
+ if (playAnswerSounds)
343
+ playSoundForAnswerAccuracy(parseFloat(data.pronunciation_accuracy))
344
+
345
+ document.getElementById("recorded_ipa_script").innerHTML = "/ " + data.ipa_transcript + " /";
346
+ document.getElementById("recordAudio").classList.add('disabled');
347
+ document.getElementById("main_title").innerHTML = page_title;
348
+ document.getElementById("pronunciation_accuracy").innerHTML = data.pronunciation_accuracy + "%";
349
+
350
+ lettersOfWordAreCorrect = data.is_letter_correct_all_words.split(" ")
351
+
352
+
353
+ startTime = data.start_time;
354
+ endTime = data.end_time;
355
+
356
+
357
+ real_transcripts_ipa = data.real_transcripts_ipa.split(" ")
358
+ matched_transcripts_ipa = data.matched_transcripts_ipa.split(" ")
359
+ wordCategories = data.pair_accuracy_category.split(" ")
360
+ let currentTextWords = currentText[0].split(" ")
361
+
362
+ coloredWords = "";
363
+ for (let word_idx = 0; word_idx < currentTextWords.length; word_idx++) {
364
+
365
+ wordTemp = '';
366
+ for (let letter_idx = 0; letter_idx < currentTextWords[word_idx].length; letter_idx++) {
367
+ letter_is_correct = lettersOfWordAreCorrect[word_idx][letter_idx] == '1'
368
+ if (letter_is_correct)
369
+ color_letter = 'green'
370
+ else
371
+ color_letter = 'red'
372
+
373
+ wordTemp += '<font color=' + color_letter + '>' + currentTextWords[word_idx][letter_idx] + "</font>"
374
+ }
375
376
+ coloredWords += " " + wrapWordForIndividualPlayback(wordTemp, word_idx)
377
+ }
378
+
379
+
380
+
381
+ document.getElementById("original_script").innerHTML = coloredWords
382
+
383
+ currentSoundRecorded = true;
384
+ unblockUI();
385
+ document.getElementById("playRecordedAudio").classList.remove('disabled');
386
+
387
+ });
388
+ }
389
+ catch {
390
+ UIError();
391
+ }
392
+ };
393
+
394
+ });
395
+ };
396
+ startMediaDevice();
397
+
398
+ // ################### Audio playback ##################
399
+ const playSoundForAnswerAccuracy = async (accuracy) => {
400
+
401
+ currentAudioForPlaying = soundFileGood;
402
+ if (accuracy < mediumScoreThreshold) {
403
+ if (accuracy < badScoreThreshold) {
404
+ currentAudioForPlaying = soundFileBad;
405
+ }
406
+ else {
407
+ currentAudioForPlaying = soundFileOkay;
408
+ }
409
+ }
410
+ playback();
411
+
412
+ }
413
+
414
+ const playAudio = async () => {
415
+
416
+ document.getElementById("main_title").innerHTML = "Generating sound...";
417
+ playWithMozillaApi(currentText[0]);
418
+ document.getElementById("main_title").innerHTML = "Current Sound was played";
419
+
420
+ };
421
+
422
+ function playback() {
423
+ const playSound = ctx.createBufferSource();
424
+ playSound.buffer = currentAudioForPlaying;
425
+ playSound.connect(ctx.destination);
426
+ playSound.start(ctx.currentTime)
427
+ }
428
+
429
+
430
+ const playRecording = async (start = null, end = null) => {
431
+ blockUI();
432
+
433
+ try {
434
+ if (start == null || end == null) {
435
+ endTimeInMs = Math.round(audioRecorded.duration * 1000)
436
+ audioRecorded.addEventListener("ended", function () {
437
+ audioRecorded.currentTime = 0;
438
+ unblockUI();
439
+ document.getElementById("main_title").innerHTML = "Recorded Sound was played";
440
+ });
441
+ await audioRecorded.play();
442
+
443
+ }
444
+ else {
445
+ audioRecorded.currentTime = start;
446
+ audioRecorded.play();
447
+ durationInSeconds = end - start;
448
+ endTimeInMs = Math.round(durationInSeconds * 1000);
449
+ setTimeout(function () {
450
+ unblockUI();
451
+ audioRecorded.pause();
452
+ audioRecorded.currentTime = 0;
453
+ document.getElementById("main_title").innerHTML = "Recorded Sound was played";
454
+ }, endTimeInMs);
455
+
456
+ }
457
+ }
458
+ catch {
459
+ UINotSupported();
460
+ }
461
+ };
462
+
463
+ const playNativeAndRecordedWord = async (word_idx) => {
464
+
465
+ if (isNativeSelectedForPlayback)
466
+ playCurrentWord(word_idx)
467
+ else
468
+ playRecordedWord(word_idx);
469
+
470
+ isNativeSelectedForPlayback = !isNativeSelectedForPlayback;
471
+ }
472
+
473
+ const stopRecording = () => {
474
+ isRecording = false
475
+ mediaRecorder.stop()
476
+ document.getElementById("main_title").innerHTML = "Processing audio...";
477
+ }
478
+
479
+
480
+ const playCurrentWord = async (word_idx) => {
481
+
482
+ document.getElementById("main_title").innerHTML = "Generating word...";
483
+ playWithMozillaApi(currentText[0].split(' ')[word_idx]);
484
+ document.getElementById("main_title").innerHTML = "Word was played";
485
+ }
486
+
487
+ // TODO: Check if fallback is correct
488
+ const playWithMozillaApi = (text) => {
489
+
490
+ if (languageFound) {
491
+ blockUI();
492
+ if (voice_synth == null)
493
+ changeLanguage(AILanguage);
494
+
495
+ var utterThis = new SpeechSynthesisUtterance(text);
496
+ utterThis.voice = voice_synth;
497
+ utterThis.rate = 0.7;
498
+ utterThis.onend = function (event) {
499
+ unblockUI();
500
+ }
501
+ synth.speak(utterThis);
502
+ }
503
+ else {
504
+ UINotSupported();
505
+ }
506
+ }
507
+
508
+ const playRecordedWord = (word_idx) => {
509
+
510
+ wordStartTime = parseFloat(startTime.split(' ')[word_idx]);
511
+ wordEndTime = parseFloat(endTime.split(' ')[word_idx]);
512
+
513
+ playRecording(wordStartTime, wordEndTime);
514
+
515
+ }
516
+
517
+ // ############# Utils #####################
518
+ const convertBlobToBase64 = async (blob) => {
519
+ return await blobToBase64(blob);
520
+ }
521
+
522
+ const blobToBase64 = blob => new Promise((resolve, reject) => {
523
+ const reader = new FileReader();
524
+ reader.readAsDataURL(blob);
525
+ reader.onload = () => resolve(reader.result);
526
+ reader.onerror = error => reject(error);
527
+ });
528
+
529
+ const wrapWordForPlayingLink = (word, word_idx, isFromRecording, word_accuracy_color) => {
530
+ if (isFromRecording)
531
+ return '<a style = " white-space:nowrap; color:' + word_accuracy_color + '; " href="javascript:playRecordedWord(' + word_idx.toString() + ')" >' + word + '</a> '
532
+ else
533
+ return '<a style = " white-space:nowrap; color:' + word_accuracy_color + '; " href="javascript:playCurrentWord(' + word_idx.toString() + ')" >' + word + '</a> '
534
+ }
535
+
536
+ const wrapWordForIndividualPlayback = (word, word_idx) => {
537
+
538
+
539
+ return '<a onmouseover="generateWordModal(' + word_idx.toString() + ')" style = " white-space:nowrap; " href="javascript:playNativeAndRecordedWord(' + word_idx.toString() + ')" >' + word + '</a> '
540
+
541
+ }
542
+
543
+ // ########## Function to initialize server ###############
544
+ // This is to try to avoid an AWS Lambda cold start
545
+ try {
546
+ fetch(apiMainPathSTS + '/GetAccuracyFromRecordedAudio', {
547
+ method: "post",
548
+ body: JSON.stringify({ "title": '', "base64Audio": '', "language": AILanguage }),
549
+ headers: { "X-Api-Key": STScoreAPIKey }
550
+
551
+ });
552
+ }
553
+ catch { }
554
+
555
+ const initializeServer = async () => {
556
+
557
+ valid_response = false;
558
+ document.getElementById("main_title").innerHTML = 'Initializing server, this may take up to 2 minutes...';
559
+ let number_of_tries = 0;
560
+ let maximum_number_of_tries = 4;
561
+
562
+ while (!valid_response) {
563
+ if (number_of_tries > maximum_number_of_tries) {
564
+ serverWorking = false;
565
+ break;
566
+ }
567
+
568
+ try {
569
+ await fetch(apiMainPathSTS + '/GetAccuracyFromRecordedAudio', {
570
+ method: "post",
571
+ body: JSON.stringify({ "title": '', "base64Audio": '', "language": AILanguage }),
572
+ headers: { "X-Api-Key": STScoreAPIKey }
573
+
574
+ }).then(() =>
575
+ valid_response = true);
576
+ serverIsInitialized = true;
577
+ }
578
+ catch
579
+ {
580
+ number_of_tries += 1;
581
+ }
582
+ }
583
+ }
584
+
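The recorder callback above posts JSON with `title`, `base64Audio`, and `language` fields plus an `X-Api-Key` header to `/GetAccuracyFromRecordedAudio`, and `blobToBase64` produces a data URL via `FileReader.readAsDataURL`. A minimal sketch of exercising the same endpoint outside the browser, assuming the Flask app from webApp.py is running locally on port 3000 and that a `recording.ogg` file exists (both are illustrative assumptions, not part of this commit):

```python
# Sketch: call the local scoring endpoint the way callbacks.js does.
# Assumes webApp.py is serving on http://127.0.0.1:3000 and that
# recording.ogg exists in the working directory (illustrative assumptions).
import base64
import json

import requests

with open("recording.ogg", "rb") as f:
    # blobToBase64 in callbacks.js yields a data URL, so mimic that prefix here.
    audio_b64 = "data:audio/ogg;base64," + base64.b64encode(f.read()).decode("utf-8")

payload = {
    "title": "Hallo, das ist ein Test",
    "base64Audio": audio_b64,
    "language": "de",
}
response = requests.post(
    "http://127.0.0.1:3000/GetAccuracyFromRecordedAudio",
    data=json.dumps(payload),
    headers={"X-Api-Key": "rll5QsTiv83nti99BW6uCmvs9BDVxSB39SVFceYb"},  # public key from callbacks.js
)
data = response.json()
print(data["pronunciation_accuracy"], data["ipa_transcript"])
```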
templates/.DS_Store ADDED
Binary file (6.15 kB).
 
templates/main.html ADDED
@@ -0,0 +1,181 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1">
7
+
8
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet"
9
+ integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3" crossorigin="anonymous">
10
+
11
+
12
+ <link rel="stylesheet" href="../static/css/style-new.css">
13
+ <script src="../static/javascript/callbacks.js"></script>
14
+
15
+
16
+ <title>AI pronunciation trainer</title>
17
+
18
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js" type="text/javascript"></script>
19
+
20
+ <link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">
21
+
22
+
23
+
24
+ </head>
25
+
26
+ <body style="height: 100%; width: 100%; background-color: white; max-width: 90%;">
27
+
28
+ <div>
29
+
30
+ <div style="display:flex; flex-direction: row;">
31
+
32
+
33
+ <div style="display:inline-block; margin-left: 1.5em;">
34
+ <i class="material-icons icon-text-home" style="text-align: right;" onclick="history.go(0)">home</i>
35
+ </div>
36
+
37
+ <h1 id='main_title'> AI Pronunciation Trainer
38
+ </h1>
39
+ </div>
40
+
41
+
42
+ <div class="expanded">
43
+ <div class="horizontal-flexbox" style="display:flex; flex-direction: row;">
44
+
45
+
46
+ <p class="accuracy-text" style="font-size:1em; text-align: left; padding-top: 3px; padding-left: 5px;">
47
+ Language: </p>
48
+ <!--dropbtn accuracy-text-->
49
+ <div class="dropdown">
50
+ <button id="languageBox" class="dropbtn accuracy-text"
51
+ style="font-size:1em; text-align: left; padding-top: 3px; padding-left: 0px;">German</button>
52
+ <div class="dropdown-content">
53
+ <a href="javascript:changeLanguage('de',true)" class="accuracy-text"
54
+ style="padding-top: 3px; ">German</a>
55
+ <a href="javascript:changeLanguage('en',true)" class="accuracy-text ;"
56
+ style="padding-top: 3px; ">English</a>
57
+ </div>
58
+ </div>
59
+
60
+ <p id="section_accuracy" class="accuracy-text"
61
+ style="text-align: left; color: black; font-size: larger;">
62
+ | Score: 0
63
+ </p>
64
+
65
+ </div>
66
+ </div>
67
+
68
+ <div style="margin-bottom: 200px;">
69
+
70
+ </div>
71
+
72
+
73
+ <div class="container">
74
+
75
+ <div class="horizontal-flexbox" style="position: absolute; top: 2%; ">
76
+
77
+ <a id="playSampleAudio" href="javascript:playAudio()" class="round-button disabled" style="color:white; text-align:center;
78
+ position: absolute; top: 2%; "><i class="material-icons icon-text">play_arrow</i>
79
+ </a>
80
+
81
+ <a id="playRecordedAudio" href="javascript:playRecording()" class="round-button disabled"
82
+ style="color:white; text-align:center; position: absolute; top: 15%; "><i
83
+ class="material-icons icon-text">record_voice_over</i>
84
+ </a>
85
+ <p id="pronunciation_accuracy" class="expanded accuracy-text"
86
+ style="text-align: center; color: black; position: absolute; top: 27%; ">
87
+ -
88
+ </p>
89
+
90
+ </div>
91
+
92
+ <div id="text-area" class="main-text-div">
93
+
94
+ <p id="original_script" class=" bigger-text text-primary main-text">Click on the bar on the
95
+ right
96
+ to
97
+ generate a
98
+ new sentence (please use the Chrome web browser).
99
+ </p>
100
+ <p id="ipa_script" class="text-muted bigger-text ipa-text"> Before speaking, click on the mic button
101
+ below to start recording and then click again when you're done.
102
+ </p>
103
+ <p id="recorded_ipa_script" class="text-primary ipa-text">On the left bottom you can choose the
104
+ difficulty. On the upper left you can choose the language.
105
+ </p>
106
+ <p id="translated_script" class="text-muted medium-text ipa-text"> The corresponding IPA reading of each
107
+ sentence will also be displayed. If you have never heard of IPA, you can check out this
108
+ <a href="https://www.youtube.com/watch?v=mzrLZi6fipA&list=RDCMUCQAUWk_yGz7bk1181DrijNw&start_radio=1&rv=mzrLZi6fipA&t=22&ab_channel=FluentForever"
109
+ target="_blank">playlist</a>. Try to get at least 690 points a day. Don't be shy! You can do it
110
+ :)
111
+ </p>
112
+
113
+ </div>
114
+
115
+ <div id="nextButtonDiv" style="position: absolute; left: 90%; top:0%; height: 100%;" class="flex-container">
116
+ <button id="buttonNext" class="expanded button-next" onclick="javascript:getNextSample()">
117
+ <span></span></button>
118
+ </div>
119
+ </div>
120
+
121
+
122
+
123
+
124
+ <div class="container-small flex expand"
125
+ style="align-items: center; text-align: center; vertical-align:middle; ">
126
+ <p id="single_word_ipa_pair" class="expand ipa-text-small"
127
+ style="text-align: center; vertical-align: middle;">Reference | Spoken
128
+ </p>
129
+ </div>
130
+
131
+
132
+ <div id="btn-record" class="expanded mic-button-div">
133
+ <a id="recordAudio" href="javascript:updateRecordingState()" class="round-button-mic disabled"
134
+ style="color:white; text-align:center; "><i id="recordIcon" class="material-icons icon-text-mic">mic</i>
135
+ </a>
136
+ </div>
137
+
138
+
139
+ <div id="radio-difficulty" class="radio" style="position: fixed; top: 95%; left: 2%;">
140
+ <input label="Random" type="radio" id="lengthCat1" name='length' onclick="javascript:getNextSample()">
141
+ <input label="Easy" type="radio" id="lengthCat2" name='length' checked onclick="javascript:getNextSample()">
142
+ <input label="Medium" type="radio" id="lengthCat3" name='length' onclick="javascript:getNextSample()">
143
+ <input label="Hard" type="radio" id="lengthCat4" name='length' onclick="javascript:getNextSample()">
144
+ </div>
145
+
146
+ </div>
147
+
148
+
149
+ <p class="credits-icon-div">By Thiago
150
+ Lobato.</p>
151
+
152
+ <div class="link-icon-div">
153
+ <a href="https://github.com/Thiagohgl/ai-pronunciation-trainer" target=”_blank”
154
+ style="text-decoration:none; vertical-align: middle; ">
155
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" class="svg-icon">
156
+ <path
157
+ d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z" />
158
+ </svg>
159
+ </a>
160
+
161
+ <a href="https://www.linkedin.com/in/thiagohgl/" target=”_blank”
162
+ style="text-decoration:none; vertical-align: middle; padding-top: 2.3em; ">
163
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" class="svg-icon">
164
+ <path
165
+ d="M19 0h-14c-2.761 0-5 2.239-5 5v14c0 2.761 2.239 5 5 5h14c2.762 0 5-2.239 5-5v-14c0-2.761-2.238-5-5-5zm-11 19h-3v-11h3v11zm-1.5-12.268c-.966 0-1.75-.79-1.75-1.764s.784-1.764 1.75-1.764 1.75.79 1.75 1.764-.783 1.764-1.75 1.764zm13.5 12.268h-3v-5.604c0-3.368-4-3.113-4 0v5.604h-3v-11h3v1.765c1.396-2.586 7-2.777 7 2.476v6.759z" />
166
+ </svg>
167
+
168
+ </a>
169
+ </div>
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+ </body>
178
+
179
+
180
+
181
+ </html>
unitTests.py ADDED
@@ -0,0 +1,99 @@
1
+ import unittest
2
+
3
+ import ModelInterfaces
4
+ import lambdaGetSample
5
+ import RuleBasedModels
6
+ import epitran
7
+ import json
8
+ import pronunciationTrainer
9
+
10
+
11
+ def test_category(category: int, threshold_min: int, threshold_max: int):
12
+ event = {'body': json.dumps({'category': category, 'language': 'de'})}
13
+ for _ in range(1000):
14
+ response = lambdaGetSample.lambda_handler(event, [])
15
+ response_dict = json.loads(response)
16
+ number_of_words = len(
17
+ response_dict['real_transcript'][0].split())
18
+ length_valid = number_of_words > threshold_min and number_of_words <= threshold_max
19
+ if not length_valid:
20
+ print('Category ', category,
21
+ ' had a sentence with length ', number_of_words)
22
+ return False
23
+ return True
24
+
25
+
26
+ class TestDataset(unittest.TestCase):
27
+
28
+ def test_random_sentences(self):
29
+
30
+ self.assertFalse(test_category(0, 0, 8))
31
+
32
+ def test_easy_sentences(self):
33
+
34
+ self.assertTrue(test_category(1, 0, 8))
35
+
36
+ def test_normal_sentences(self):
37
+ self.assertTrue(test_category(2, 8, 20))
38
+
39
+ def test_hard_sentences(self):
40
+ self.assertTrue(test_category(3, 20, 10000))
41
+
42
+
43
+ def check_phonem_converter(converter: ModelInterfaces.ITextToPhonemModel, input: str, expected_output: str):
44
+ output = converter.convertToPhonem(input)
45
+
46
+ is_correct = output == expected_output
47
+ if not is_correct:
48
+ print('Conversion from "', input, '" should be "',
49
+ expected_output, '", but was "', output, '"')
50
+ return is_correct
51
+
52
+
53
+ class TestPhonemConverter(unittest.TestCase):
54
+
55
+ def test_english(self):
56
+ phonem_converter = RuleBasedModels.EngPhonemConverter()
57
+ self.assertTrue(check_phonem_converter(
58
+ phonem_converter, 'Hello, this is a test', 'hɛˈloʊ, ðɪs ɪz ə tɛst'))
59
+
60
+ def test_german(self):
61
+ phonem_converter = RuleBasedModels.EpitranPhonemConverter(
62
+ epitran.Epitran('deu-Latn'))
63
+
64
+ self.assertTrue(check_phonem_converter(
65
+ phonem_converter, 'Hallo, das ist ein Test', 'haloː, dɑːs ɪst ain tɛst'))
66
+
67
+
68
+ trainer_SST_lambda = {}
69
+ trainer_SST_lambda['de'] = pronunciationTrainer.getTrainer("de")
70
+
71
+
72
+ class TestScore(unittest.TestCase):
73
+
74
+ def test_exact_transcription(self):
75
+ words_real = 'Ich habe sehr viel glück, am leben und gesund zu sein'
76
+
77
+ real_and_transcribed_words, _, _ = trainer_SST_lambda['de'].matchSampleAndRecordedWords(
78
+ words_real, words_real)
79
+
80
+ pronunciation_accuracy, _ = trainer_SST_lambda['de'].getPronunciationAccuracy(
81
+ real_and_transcribed_words)
82
+
83
+ self.assertTrue(int(pronunciation_accuracy) == 100)
84
+
85
+ def test_incorrect_transcription(self):
86
+ words_real = 'Ich habe sehr viel glück, am leben und gesund zu sein'
87
+ words_transcribed = 'Ic hab zeh viel guck am und gesund tu sein'
88
+
89
+ real_and_transcribed_words, _, _ = trainer_SST_lambda['de'].matchSampleAndRecordedWords(
90
+ words_real, words_transcribed)
91
+
92
+ pronunciation_accuracy, _ = trainer_SST_lambda['de'].getPronunciationAccuracy(
93
+ real_and_transcribed_words)
94
+
95
+ self.assertTrue(int(pronunciation_accuracy) == 71)
96
+
97
+
98
+ if __name__ == '__main__':
99
+ unittest.main()
utilsFileIO.py ADDED
@@ -0,0 +1,9 @@
1
+ import string
2
+ import random
3
+
4
+
5
+ def generateRandomString(str_length: int = 20):
6
+
7
+ # Build the string from lowercase ASCII letters
8
+ letters = string.ascii_lowercase
9
+ return ''.join(random.choice(letters) for i in range(str_length))
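`generateRandomString` simply draws `str_length` lowercase ASCII letters. A quick usage sketch (the temporary-file naming below is only an illustration, not something this commit does):

```python
# Sketch: use generateRandomString to build a throwaway file name.
# The ".ogg" suffix and the target directory are illustrative assumptions.
import utilsFileIO

random_name = utilsFileIO.generateRandomString(20)  # 20 lowercase letters, e.g. 'qwhzplm...'
temp_audio_path = "./" + random_name + ".ogg"
print(temp_audio_path)
```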
webApp.py ADDED
@@ -0,0 +1,47 @@
1
+ from flask import Flask, render_template, request
2
+ import webbrowser
3
+ import os
4
+ from flask_cors import CORS
5
+ import json
6
+
7
+ import lambdaTTS
8
+ import lambdaSpeechToScore
9
+ import lambdaGetSample
10
+
11
+ app = Flask(__name__)
12
+ cors = CORS(app)
13
+ app.config['CORS_HEADERS'] = '*'
14
+
15
+ rootPath = ''
16
+
17
+
18
+ @app.route(rootPath+'/')
19
+ def main():
20
+ return render_template('main.html')
21
+
22
+
23
+ @app.route(rootPath+'/getAudioFromText', methods=['POST'])
24
+ def getAudioFromText():
25
+ event = {'body': json.dumps(request.get_json(force=True))}
26
+ return lambdaTTS.lambda_handler(event, [])
27
+
28
+
29
+ @app.route(rootPath+'/getSample', methods=['POST'])
30
+ def getNext():
31
+ event = {'body': json.dumps(request.get_json(force=True))}
32
+ return lambdaGetSample.lambda_handler(event, [])
33
+
34
+
35
+ @app.route(rootPath+'/GetAccuracyFromRecordedAudio', methods=['POST'])
36
+ def GetAccuracyFromRecordedAudio():
37
+
38
+ event = {'body': json.dumps(request.get_json(force=True))}
39
+ lambda_correct_output = lambdaSpeechToScore.lambda_handler(event, [])
40
+ return lambda_correct_output
41
+
42
+
43
+ if __name__ == "__main__":
44
+ language = 'de'
45
+ print(os.system('pwd'))
46
+ webbrowser.open_new('http://127.0.0.1:3000/')
47
+ app.run(host="0.0.0.0", port=3000)