Spaces:

TheComputerMan
/

IMS-Toucan-modified

Build error

App Files Files Community

TheComputerMan commited on Aug 27, 2022

Commit

385b498

1 Parent(s): 525a4f7

Upload ArticulatoryCombinedTextFrontend.py

Browse files

Files changed (1) hide show

ArticulatoryCombinedTextFrontend.py +323 -0

ArticulatoryCombinedTextFrontend.py ADDED Viewed

	@@ -0,0 +1,323 @@

+import re
+import sys
+import panphon
+import phonemizer
+import torch
+from Preprocessing.papercup_features import generate_feature_table
+class ArticulatoryCombinedTextFrontend:
+    def __init__(self,
+                 language,
+                 use_word_boundaries=False,  # goes together well with
+                 # parallel models and an aligner. Doesn't go together
+                 # well with autoregressive models.
+                 use_explicit_eos=True,
+                 use_prosody=False,  # unfortunately the non-segmental
+                 # nature of prosodic markers mixed with the sequential
+                 # phonemes hurts the performance of end-to-end models a
+                 # lot, even though one might think enriching the input
+                 # with such information would help.
+                 use_lexical_stress=False,
+                 silent=True,
+                 allow_unknown=False,
+                 add_silence_to_end=True,
+                 strip_silence=True):
+        """
+        Mostly preparing ID lookups
+        """
+        self.strip_silence = strip_silence
+        self.use_word_boundaries = use_word_boundaries
+        self.allow_unknown = allow_unknown
+        self.use_explicit_eos = use_explicit_eos
+        self.use_prosody = use_prosody
+        self.use_stress = use_lexical_stress
+        self.add_silence_to_end = add_silence_to_end
+        self.feature_table = panphon.FeatureTable()
+        if language == "en":
+            self.g2p_lang = "en-us"
+            self.expand_abbreviations = english_text_expansion
+            if not silent:
+                print("Created an English Text-Frontend")
+        elif language == "de":
+            self.g2p_lang = "de"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a German Text-Frontend")
+        elif language == "el":
+            self.g2p_lang = "el"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Greek Text-Frontend")
+        elif language == "es":
+            self.g2p_lang = "es"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Spanish Text-Frontend")
+        elif language == "fi":
+            self.g2p_lang = "fi"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Finnish Text-Frontend")
+        elif language == "ru":
+            self.g2p_lang = "ru"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Russian Text-Frontend")
+        elif language == "hu":
+            self.g2p_lang = "hu"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Hungarian Text-Frontend")
+        elif language == "nl":
+            self.g2p_lang = "nl"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Dutch Text-Frontend")
+        elif language == "fr":
+            self.g2p_lang = "fr-fr"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a French Text-Frontend")
+        elif language == "it":
+            self.g2p_lang = "it"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Italian Text-Frontend")
+        elif language == "pt":
+            self.g2p_lang = "pt"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Portuguese Text-Frontend")
+        elif language == "pl":
+            self.g2p_lang = "pl"
+            self.expand_abbreviations = lambda x: x
+            if not silent:
+                print("Created a Polish Text-Frontend")
+        # remember to also update get_language_id() when adding something here
+        else:
+            print("Language not supported yet")
+            sys.exit()
+        self.phone_to_vector_papercup = generate_feature_table()
+        self.phone_to_vector = dict()
+        for phone in self.phone_to_vector_papercup:
+            panphon_features = self.feature_table.word_to_vector_list(phone, numeric=True)
+            if panphon_features == []:
+                panphon_features = [[0] * 24]
+            papercup_features = self.phone_to_vector_papercup[phone]
+            self.phone_to_vector[phone] = papercup_features + panphon_features[0]
+        self.phone_to_id = {  # this lookup must be updated manually, because the only
+            # other way would be extracting them from a set, which can be non-deterministic
+            '~': 0,
+            '#': 1,
+            '?': 2,
+            '!': 3,
+            '.': 4,
+            'ɜ': 5,
+            'ɫ': 6,
+            'ə': 7,
+            'ɚ': 8,
+            'a': 9,
+            'ð': 10,
+            'ɛ': 11,
+            'ɪ': 12,
+            'ᵻ': 13,
+            'ŋ': 14,
+            'ɔ': 15,
+            'ɒ': 16,
+            'ɾ': 17,
+            'ʃ': 18,
+            'θ': 19,
+            'ʊ': 20,
+            'ʌ': 21,
+            'ʒ': 22,
+            'æ': 23,
+            'b': 24,
+            'ʔ': 25,
+            'd': 26,
+            'e': 27,
+            'f': 28,
+            'g': 29,
+            'h': 30,
+            'i': 31,
+            'j': 32,
+            'k': 33,
+            'l': 34,
+            'm': 35,
+            'n': 36,
+            'ɳ': 37,
+            'o': 38,
+            'p': 39,
+            'ɡ': 40,
+            'ɹ': 41,
+            'r': 42,
+            's': 43,
+            't': 44,
+            'u': 45,
+            'v': 46,
+            'w': 47,
+            'x': 48,
+            'z': 49,
+            'ʀ': 50,
+            'ø': 51,
+            'ç': 52,
+            'ɐ': 53,
+            'œ': 54,
+            'y': 55,
+            'ʏ': 56,
+            'ɑ': 57,
+            'c': 58,
+            'ɲ': 59,
+            'ɣ': 60,
+            'ʎ': 61,
+            'β': 62,
+            'ʝ': 63,
+            'ɟ': 64,
+            'q': 65,
+            'ɕ': 66,
+            'ʲ': 67,
+            'ɭ': 68,
+            'ɵ': 69,
+            'ʑ': 70,
+            'ʋ': 71,
+            'ʁ': 72,
+            'ɨ': 73,
+            'ʂ': 74,
+            'ɬ': 75,
+            }  # for the states of the ctc loss and dijkstra/mas in the aligner
+        self.id_to_phone = {v: k for k, v in self.phone_to_id.items()}
+    def string_to_tensor(self, text, view=False, device="cpu", handle_missing=True, input_phonemes=False):
+        """
+        Fixes unicode errors, expands some abbreviations,
+        turns graphemes into phonemes and then vectorizes
+        the sequence as articulatory features
+        """
+        if input_phonemes:
+            phones = text
+        else:
+            phones = self.get_phone_string(text=text, include_eos_symbol=True)
+        if view:
+            print("Phonemes: \n{}\n".format(phones))
+        phones_vector = list()
+        # turn into numeric vectors
+        for char in phones:
+            if handle_missing:
+                try:
+                    phones_vector.append(self.phone_to_vector[char])
+                except KeyError:
+                    print("unknown phoneme: {}".format(char))
+            else:
+                phones_vector.append(self.phone_to_vector[char])  # leave error handling to elsewhere
+        return torch.Tensor(phones_vector, device=device)
+    def get_phone_string(self, text, include_eos_symbol=True):
+        # expand abbreviations
+        utt = self.expand_abbreviations(text)
+        # phonemize
+        phones = phonemizer.phonemize(utt,
+                                      language_switch='remove-flags',
+                                      backend="espeak",
+                                      language=self.g2p_lang,
+                                      preserve_punctuation=True,
+                                      strip=True,
+                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/',
+                                      with_stress=self.use_stress).replace(";", ",").replace("/", " ").replace("—", "") \
+            .replace(":", ",").replace('"', ",").replace("-", ",").replace("...", ",").replace("-", ",").replace("\n", " ") \
+            .replace("\t", " ").replace("¡", "").replace("¿", "").replace(",", "~").replace(" ̃", "").replace('̩', "").replace("̃", "").replace("̪", "")
+        # less than 1 wide characters hidden here
+        phones = re.sub("~+", "~", phones)
+        if not self.use_prosody:
+            # retain ~ as heuristic pause marker, even though all other symbols are removed with this option.
+            # also retain . ? and ! since they can be indicators for the stop token
+            phones = phones.replace("ˌ", "").replace("ː", "").replace("ˑ", "") \
+                .replace("˘", "").replace("|", "").replace("‖", "")
+        if not self.use_word_boundaries:
+            phones = phones.replace(" ", "")
+        else:
+            phones = re.sub(r"\s+", " ", phones)
+            phones = re.sub(" ", "~", phones)
+        if self.strip_silence:
+            phones = phones.lstrip("~").rstrip("~")
+        if self.add_silence_to_end:
+            phones += "~"  # adding a silence in the end during add_silence_to_end produces more natural sounding prosody
+        if include_eos_symbol:
+            phones += "#"
+        phones = "~" + phones
+        phones = re.sub("~+", "~", phones)
+        return phones
+def english_text_expansion(text):
+    """
+    Apply as small part of the tacotron style text cleaning pipeline, suitable for e.g. LJSpeech.
+    See https://github.com/keithito/tacotron/
+    Careful: Only apply to english datasets. Different languages need different cleaners.
+    """
+    _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
+                      [('Mrs.', 'misess'), ('Mr.', 'mister'), ('Dr.', 'doctor'), ('St.', 'saint'), ('Co.', 'company'), ('Jr.', 'junior'), ('Maj.', 'major'),
+                       ('Gen.', 'general'), ('Drs.', 'doctors'), ('Rev.', 'reverend'), ('Lt.', 'lieutenant'), ('Hon.', 'honorable'), ('Sgt.', 'sergeant'),
+                       ('Capt.', 'captain'), ('Esq.', 'esquire'), ('Ltd.', 'limited'), ('Col.', 'colonel'), ('Ft.', 'fort')]]
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+def get_language_id(language):
+    if language == "en":
+        return torch.LongTensor([12])
+    elif language == "de":
+        return torch.LongTensor([1])
+    elif language == "el":
+        return torch.LongTensor([2])
+    elif language == "es":
+        return torch.LongTensor([3])
+    elif language == "fi":
+        return torch.LongTensor([4])
+    elif language == "ru":
+        return torch.LongTensor([5])
+    elif language == "hu":
+        return torch.LongTensor([6])
+    elif language == "nl":
+        return torch.LongTensor([7])
+    elif language == "fr":
+        return torch.LongTensor([8])
+    elif language == "pt":
+        return torch.LongTensor([9])
+    elif language == "pl":
+        return torch.LongTensor([10])
+    elif language == "it":
+        return torch.LongTensor([11])
+if __name__ == '__main__':
+    # test an English utterance
+    tfr_en = ArticulatoryCombinedTextFrontend(language="en")
+    print(tfr_en.string_to_tensor("This is a complex sentence, it even has a pause! But can it do this? Nice.", view=True))
+    tfr_en = ArticulatoryCombinedTextFrontend(language="de")
+    print(tfr_en.string_to_tensor("Alles klar, jetzt testen wir einen deutschen Satz. Ich hoffe es gibt nicht mehr viele unspezifizierte Phoneme.", view=True))