Spaces:
Build error
Build error
Commit
·
385b498
1
Parent(s):
525a4f7
Upload ArticulatoryCombinedTextFrontend.py
Browse files
ArticulatoryCombinedTextFrontend.py
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import sys
|
3 |
+
|
4 |
+
import panphon
|
5 |
+
import phonemizer
|
6 |
+
import torch
|
7 |
+
|
8 |
+
from Preprocessing.papercup_features import generate_feature_table
|
9 |
+
|
10 |
+
|
11 |
+
class ArticulatoryCombinedTextFrontend:
    """
    Text frontend that converts text into a phoneme string and then into a
    sequence of articulatory feature vectors.

    Each phone's vector is the concatenation of the features returned by
    generate_feature_table() and the numeric panphon features for that phone.
    """

    # language code -> (voice handed to the espeak backend of phonemizer,
    #                   wording used in the "Created ... Text-Frontend" message)
    # remember to also update get_language_id() when adding something here
    _SUPPORTED_LANGUAGES = {
        "en": ("en-us", "an English"),
        "de": ("de", "a German"),
        "el": ("el", "a Greek"),
        "es": ("es", "a Spanish"),
        "fi": ("fi", "a Finnish"),
        "ru": ("ru", "a Russian"),
        "hu": ("hu", "a Hungarian"),
        "nl": ("nl", "a Dutch"),
        "fr": ("fr-fr", "a French"),
        "it": ("it", "a Italian"),
        "pt": ("pt", "a Portuguese"),
        "pl": ("pl", "a Polish"),
    }

    def __init__(self,
                 language,
                 use_word_boundaries=False,  # goes together well with
                 # parallel models and an aligner. Doesn't go together
                 # well with autoregressive models.
                 use_explicit_eos=True,
                 use_prosody=False,  # unfortunately the non-segmental
                 # nature of prosodic markers mixed with the sequential
                 # phonemes hurts the performance of end-to-end models a
                 # lot, even though one might think enriching the input
                 # with such information would help.
                 use_lexical_stress=False,
                 silent=True,
                 allow_unknown=False,
                 add_silence_to_end=True,
                 strip_silence=True):
        """
        Mostly preparing ID lookups

        Args:
            language: two-letter language code; must be one of the keys of
                _SUPPORTED_LANGUAGES, otherwise a message is printed and the
                process exits via sys.exit().
            use_word_boundaries: keep word boundaries in the phoneme string
                (they end up as the pause symbol '~').
            use_explicit_eos: stored on the instance; not read anywhere in
                this class.
            use_prosody: if False, length and secondary-stress markers are
                removed from the phoneme string.
            use_lexical_stress: forwarded to phonemizer as with_stress.
            silent: if False, print a message once the frontend is set up.
            allow_unknown: stored on the instance; not read anywhere in
                this class.
            add_silence_to_end: append a pause symbol to every phoneme string.
            strip_silence: remove leading and trailing pause symbols before
                the optional final pause and end-of-sequence symbol are added.
        """
        self.strip_silence = strip_silence
        self.use_word_boundaries = use_word_boundaries
        self.allow_unknown = allow_unknown
        self.use_explicit_eos = use_explicit_eos
        self.use_prosody = use_prosody
        self.use_stress = use_lexical_stress
        self.add_silence_to_end = add_silence_to_end
        self.feature_table = panphon.FeatureTable()

        # the isinstance guard keeps non-string (possibly unhashable) arguments
        # on the "not supported" path instead of failing inside the dict lookup
        language_config = self._SUPPORTED_LANGUAGES.get(language) if isinstance(language, str) else None
        if language_config is None:
            print("Language not supported yet")
            sys.exit()

        self.g2p_lang, description = language_config
        # only English has a text expansion step, every other language passes text through unchanged
        self.expand_abbreviations = english_text_expansion if language == "en" else (lambda x: x)
        if not silent:
            print("Created {} Text-Frontend".format(description))

        self.phone_to_vector_papercup = generate_feature_table()

        self.phone_to_vector = dict()
        for phone in self.phone_to_vector_papercup:
            panphon_features = self.feature_table.word_to_vector_list(phone, numeric=True)
            if not panphon_features:
                # panphon has no entry for this symbol, so pad with zeros
                panphon_features = [[0] * 24]
            papercup_features = self.phone_to_vector_papercup[phone]
            self.phone_to_vector[phone] = papercup_features + panphon_features[0]

        self.phone_to_id = {  # this lookup must be updated manually, because the only
            # other way would be extracting them from a set, which can be non-deterministic
            '~': 0,
            '#': 1,
            '?': 2,
            '!': 3,
            '.': 4,
            'ɜ': 5,
            'ɫ': 6,
            'ə': 7,
            'ɚ': 8,
            'a': 9,
            'ð': 10,
            'ɛ': 11,
            'ɪ': 12,
            'ᵻ': 13,
            'ŋ': 14,
            'ɔ': 15,
            'ɒ': 16,
            'ɾ': 17,
            'ʃ': 18,
            'θ': 19,
            'ʊ': 20,
            'ʌ': 21,
            'ʒ': 22,
            'æ': 23,
            'b': 24,
            'ʔ': 25,
            'd': 26,
            'e': 27,
            'f': 28,
            'g': 29,
            'h': 30,
            'i': 31,
            'j': 32,
            'k': 33,
            'l': 34,
            'm': 35,
            'n': 36,
            'ɳ': 37,
            'o': 38,
            'p': 39,
            'ɡ': 40,
            'ɹ': 41,
            'r': 42,
            's': 43,
            't': 44,
            'u': 45,
            'v': 46,
            'w': 47,
            'x': 48,
            'z': 49,
            'ʀ': 50,
            'ø': 51,
            'ç': 52,
            'ɐ': 53,
            'œ': 54,
            'y': 55,
            'ʏ': 56,
            'ɑ': 57,
            'c': 58,
            'ɲ': 59,
            'ɣ': 60,
            'ʎ': 61,
            'β': 62,
            'ʝ': 63,
            'ɟ': 64,
            'q': 65,
            'ɕ': 66,
            'ʲ': 67,
            'ɭ': 68,
            'ɵ': 69,
            'ʑ': 70,
            'ʋ': 71,
            'ʁ': 72,
            'ɨ': 73,
            'ʂ': 74,
            'ɬ': 75,
        }  # for the states of the ctc loss and dijkstra/mas in the aligner

        self.id_to_phone = {v: k for k, v in self.phone_to_id.items()}

    def string_to_tensor(self, text, view=False, device="cpu", handle_missing=True, input_phonemes=False):
        """
        Fixes unicode errors, expands some abbreviations,
        turns graphemes into phonemes and then vectorizes
        the sequence as articulatory features

        Args:
            text: the text to convert, or an already phonemized string if
                input_phonemes is True.
            view: if True, print the phoneme string.
            device: device on which the returned tensor is created.
            handle_missing: if True, symbols without a feature vector are
                reported on stdout and skipped; if False the KeyError is
                propagated to the caller.
            input_phonemes: if True, text is used as the phoneme string
                as-is instead of being phonemized.

        Returns:
            A tensor with one row of features per known symbol, in the
            default floating point dtype.
        """
        if input_phonemes:
            phones = text
        else:
            phones = self.get_phone_string(text=text, include_eos_symbol=True)
        if view:
            print("Phonemes: \n{}\n".format(phones))

        # turn into numeric vectors
        phones_vector = list()
        for char in phones:
            try:
                phones_vector.append(self.phone_to_vector[char])
            except KeyError:
                if not handle_missing:
                    raise  # leave error handling to elsewhere
                print("unknown phoneme: {}".format(char))

        # torch.tensor is used instead of the legacy torch.Tensor constructor,
        # because the legacy constructor rejects every device other than the CPU
        return torch.tensor(phones_vector, dtype=torch.get_default_dtype(), device=device)

    def get_phone_string(self, text, include_eos_symbol=True):
        """
        Phonemize text and normalize the result into the symbol inventory
        used by this frontend.

        Punctuation that marks a pause is mapped to '~', the result always
        starts with '~' and runs of '~' are collapsed into one.

        Args:
            text: the text to phonemize.
            include_eos_symbol: if True, '#' is appended as the last symbol.

        Returns:
            The normalized phoneme string.
        """
        # expand abbreviations
        utt = self.expand_abbreviations(text)
        # phonemize
        phones = phonemizer.phonemize(utt,
                                      language_switch='remove-flags',
                                      backend="espeak",
                                      language=self.g2p_lang,
                                      preserve_punctuation=True,
                                      strip=True,
                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/',
                                      with_stress=self.use_stress)

        # Applied strictly in this order: several marks are first unified to ','
        # and only afterwards is ',' turned into the pause symbol '~'.
        # Per the original author's note, the last few literals contain
        # zero-width combining characters that are easy to overlook in an editor.
        ordered_replacements = (
            (";", ","),
            ("/", " "),
            ("—", ""),
            (":", ","),
            ('"', ","),
            ("-", ","),
            ("...", ","),
            ("\n", " "),
            ("\t", " "),
            ("¡", ""),
            ("¿", ""),
            (",", "~"),
            (" ̃", ""),
            ('̩', ""),
            ("̃", ""),
            ("̪", ""),
        )
        for old, new in ordered_replacements:
            phones = phones.replace(old, new)
        phones = re.sub("~+", "~", phones)

        if not self.use_prosody:
            # retain ~ as heuristic pause marker, even though all other symbols are removed with this option.
            # also retain . ? and ! since they can be indicators for the stop token
            for marker in ("ˌ", "ː", "ˑ", "˘", "|", "‖"):
                phones = phones.replace(marker, "")

        if not self.use_word_boundaries:
            phones = phones.replace(" ", "")
        else:
            # collapse whitespace, then represent each word boundary as a pause symbol
            phones = re.sub(r"\s+", " ", phones)
            phones = re.sub(" ", "~", phones)

        if self.strip_silence:
            phones = phones.strip("~")
        if self.add_silence_to_end:
            phones += "~"  # adding a silence in the end during add_silence_to_end produces more natural sounding prosody
        if include_eos_symbol:
            phones += "#"

        phones = "~" + phones
        phones = re.sub("~+", "~", phones)

        return phones
|
273 |
+
|
274 |
+
|
275 |
+
def english_text_expansion(text):
    """
    Apply a small part of the tacotron style text cleaning pipeline, suitable for e.g. LJSpeech.
    See https://github.com/keithito/tacotron/
    Careful: Only apply to english datasets. Different languages need different cleaners.

    Every listed abbreviation that is followed by a period is replaced,
    case-insensitively, with its spelled-out form. The period is consumed
    by the replacement.

    Args:
        text: the English text to expand.

    Returns:
        The text with the known abbreviations spelled out.
    """
    # The abbreviations are listed WITHOUT their trailing period: the period is
    # added once, escaped, when the pattern is built. Listing them with the
    # period and appending another one produced patterns such as r'\bMr.\.',
    # which never matched "Mr. " and turned "Mrs." into "mister".
    abbreviations = [('Mrs', 'misess'), ('Mr', 'mister'), ('Dr', 'doctor'), ('St', 'saint'), ('Co', 'company'),
                     ('Jr', 'junior'), ('Maj', 'major'), ('Gen', 'general'), ('Drs', 'doctors'), ('Rev', 'reverend'),
                     ('Lt', 'lieutenant'), ('Hon', 'honorable'), ('Sgt', 'sergeant'), ('Capt', 'captain'),
                     ('Esq', 'esquire'), ('Ltd', 'limited'), ('Col', 'colonel'), ('Ft', 'fort')]
    for abbreviation, expansion in abbreviations:
        pattern = r'\b%s\.' % re.escape(abbreviation)
        text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)
    return text
|
288 |
+
|
289 |
+
|
290 |
+
def get_language_id(language):
    """
    Look up the numeric ID of a language code.

    Args:
        language: two-letter language code.

    Returns:
        A LongTensor holding the single ID of the language, or None if the
        code is not listed.
    """
    # codes in the order of their IDs; English, although checked first in the
    # frontend, carries the highest ID
    known_languages = (
        ("de", 1),
        ("el", 2),
        ("es", 3),
        ("fi", 4),
        ("ru", 5),
        ("hu", 6),
        ("nl", 7),
        ("fr", 8),
        ("pt", 9),
        ("pl", 10),
        ("it", 11),
        ("en", 12),
    )
    for code, identifier in known_languages:
        if language == code:
            return torch.LongTensor([identifier])
    return None
|
315 |
+
|
316 |
+
|
317 |
+
if __name__ == '__main__':
    # smoke test: build a frontend per language and print the phonemes and
    # the resulting feature tensor of one utterance each

    # test an English utterance
    english_frontend = ArticulatoryCombinedTextFrontend(language="en")
    english_features = english_frontend.string_to_tensor(
        "This is a complex sentence, it even has a pause! But can it do this? Nice.",
        view=True)
    print(english_features)

    # test a German utterance
    german_frontend = ArticulatoryCombinedTextFrontend(language="de")
    german_features = german_frontend.string_to_tensor(
        "Alles klar, jetzt testen wir einen deutschen Satz. Ich hoffe es gibt nicht mehr viele unspezifizierte Phoneme.",
        view=True)
    print(german_features)
|