#!/usr/bin/env python3 # -*- coding: utf-8 -*- ''' 2022.06.03 kazakh_to_ipa() <> ipa_to_kazakh() test_kazakh() turkish_to_ipa() <> ipa_to_turkish() test_turkish() 2022.07.05 kyrgyz_to_ipa() <> ipa_to_kyrgyz() test_kyrgyz() uzbek_to_ipa() <> ipa_to_uzbek() test_uzbek() azerbaijani_to_ipa() <> ipa_to_azerbaijani() test_azerbaijani() turkmen_to_ipa() <> ipa_to_turkmen() test_turkmen() 2022.07.07 tatar_to_ipa() <> ipa_to_tatar() test_tatar() bashkir_to_ipa() <> ipa_to_bashkir() test_bashkir() sakha_to_ipa() <> ipa_to_sakha() test_sakha() 2022.07.12 experimentally added î and â to turkish_to_ipa() 2022.08.04 uyghur_to_ipa() <> ipa_to_uyghur() ''' import re # kazakh scripts def kazakh_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list. # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we can later convert them to conventional symbols. # three-sound convenience vowels: text = re.sub("[Юю]", "ǔ", text) # two-sound convenience consonants: text = re.sub("[Цц]", "š", text) text = re.sub("[Чч]", "ʆ", text) # two-sound convenience vowels: text = re.sub("[Яя]", "ǎ", text) text = re.sub("[Ее]", "ě", text) text = re.sub("[Ёё]", "ǒ", text) text = re.sub("[Ии]", "ǐ", text) text = re.sub("[Уу]", "u", text) # single-sound consonants: text = re.sub("[Бб]", "b", text) text = re.sub("[Вв]", "v", text) text = re.sub("[Гг]", "g", text) text = re.sub("[Ғғ]", "ɣ", text) text = re.sub("[Дд]", "d", text) text = re.sub("[Жж]", "ʒ", text) text = re.sub("[Зз]", "z", text) text = re.sub("[Йй]", "j", text) text = re.sub("[Кк]", "k", text) text = re.sub("[Ққ]", "q", text) text = re.sub("[Лл]", "l", text) text = re.sub("[Мм]", "m", text) text = re.sub("[Нн]", "n", text) text = re.sub("[Ңң]", "ŋ", text) text = re.sub("[Пп]", "p", text) text = re.sub("[Рр]", "r", text) text = re.sub("[Сс]", "s", text) text = re.sub("[Тт]", "t", text) text = re.sub("[Фф]", "f", text) text = re.sub("[Хх]", "x", text) text = re.sub("[Һһ]", "h", text) text = re.sub("[Шш]", "ʃ", text) text = re.sub("[Щщ]", "ɕ", text) text = re.sub("[Ъъ]", "ʔ", text) text = re.sub("[Ьь]", "ʲ", text) # single-sound vowels: text = re.sub("[Аа]", "ɑ", text) text = re.sub("[Әә]", "æ", text) text = re.sub("[Оо]", "ɔ", text) text = re.sub("[Өө]", "ɵ", text) text = re.sub("[Ұұ]", "ʊ", text) text = re.sub("[Үү]", "ʏ", text) text = re.sub("[Ыы]", "ɤ", text) text = re.sub("[Іі]", "ɪ", text) text = re.sub("[Ээ]", "e", text) # rules ''' rule 1: if [æ], [ě], [ɵ], [ʏ], [ɪ] are followed by [l] and [l] is NOT followed by [æ], [ě], [ɵ], [ʏ], [ɪ], or [ʲ], use [ł] instead of [l] (e.g., [kěł], [kěłdɪ], but [kělěmɪn], [marsělʲ]). ''' text = re.sub(r"([æěɵʏɪ])(l)([^æěɵʏɪʲ])", r"\1ł\3", text) ''' rule 2: if the letters "о" and "ө", [ɔ] and [ɵ] at the beginning of a word are preceded by [w] (e.g., осы [wɔsɤ] not [ɔsɤ], өзі [wɵzɪ] not [ɵzɪ]). ''' text = re.sub(r"\b([ɔɵ])", r"w\1", text) ''' rule 3 if the letter "у" [u] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆʲ])", r"w\1", text) ''' rule 4: if the letter "и" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆʲ])", r"i\1", text) return text def ipa_to_kazakh(text): # three-sound convenience vowels: text = re.sub("ǔ", "ю", text) # two-sound convenience consonants: text = re.sub("š", "ц", text) text = re.sub("ʆ", "ч", text) # two-sound convenience vowels: text = re.sub("ǎ", "я", text) text = re.sub("ě", "е", text) text = re.sub("ǒ", "ё", text) text = re.sub("ǐ", "и", text) text = re.sub("u", "у", text) # single-sound consonants: text = re.sub("b", "б", text) text = re.sub("v", "в", text) text = re.sub("g", "г", text) text = re.sub("ɣ", "ғ", text) text = re.sub("d", "д", text) text = re.sub("ʒ", "ж", text) text = re.sub("z", "з", text) text = re.sub("j", "й", text) text = re.sub("k", "к", text) text = re.sub("q", "қ", text) text = re.sub("l", "л", text) text = re.sub("m", "м", text) text = re.sub("n", "н", text) text = re.sub("ŋ", "ң", text) text = re.sub("p", "п", text) text = re.sub("r", "р", text) text = re.sub("s", "с", text) text = re.sub("t", "т", text) text = re.sub("f", "ф", text) text = re.sub("x", "х", text) text = re.sub("h", "һ", text) text = re.sub("ʃ", "ш", text) text = re.sub("ɕ", "щ", text) text = re.sub("ʔ", "ъ", text) text = re.sub("ʲ", "ь", text) # single-sound vowels: text = re.sub("ɑ", "а", text) text = re.sub("æ", "ә", text) text = re.sub("ɔ", "о", text) text = re.sub("ɵ", "ө", text) text = re.sub("ʊ", "ұ", text) text = re.sub("ʏ", "ү", text) text = re.sub("ɤ", "ы", text) text = re.sub("ɪ", "і", text) text = re.sub("e", "э", text) # anti-rules ''' anti-rule 1: ''' text = re.sub(r"([әеөүі])(ł)([^әеөүіь])", r"\1л\3", text) ''' anti-rule 2: ''' text = re.sub(r"\bw([оө])", r"\1", text) ''' anti-rule 3: ''' text = re.sub(r"w([бвгғджзйкқлмнңпрстфхһцчшщъьчц])", r"у\1", text) ''' anti-rule 4: the symbol [i] is used in one case only, so we can just replace it for и. ''' text = re.sub(r"i", r"и", text) ''' anti-rules for Turkish and Kyrgyz Ǯ, Turkish ł, azerbaijani ḡ, sakha ɲ ''' text = re.sub(r"w([Ǯ])", r"у\1", text) text = re.sub(r"Ǯ", r"дж", text) text = re.sub(r"ł", r"ль", text) text = re.sub(r"ḡ", r"гь", text) text = re.sub(r"ɲ", r"нь", text) return text # testing kazakh scripts def test_kazakh(text): input_text = text.lower().split() output_text = ipa_to_kazakh(kazakh_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference) # turkish scripts def turkish_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list. # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we can later convert them to conventional symbols. # two-sound convenience consonants: text = re.sub("[Cc]", "Ǯ", text) text = re.sub("[Çç]", "ʆ", text) # two-sound convenience vowels: text = re.sub("[İi]", "ǐ", text) text = re.sub("[Uu]", "u", text) # single-sound consonants: text = re.sub("[Jj]", "ʒ", text) text = re.sub("[Yy]", "j", text) text = re.sub("[Bb]", "b", text) text = re.sub("[Dd]", "d", text) text = re.sub("[Ff]", "f", text) text = re.sub("[Gg]", "g", text) text = re.sub("[Ğğ]", "ɣ", text) text = re.sub("[Hh]", "h", text) text = re.sub("[Kk]", "k", text) text = re.sub("[Ll]", "l", text) text = re.sub("[Mm]", "m", text) text = re.sub("[Nn]", "n", text) text = re.sub("[Pp]", "p", text) text = re.sub("[Rr]", "r", text) text = re.sub("[Ss]", "s", text) text = re.sub("[Şş]", "ʃ", text) text = re.sub("[Tt]", "t", text) text = re.sub("[Vv]", "v", text) text = re.sub("[Zz]", "z", text) # single-sound vowels: text = re.sub("[Aa]", "ɑ", text) text = re.sub("[Ee]", "e", text) text = re.sub("[Iı]", "ɤ", text) text = re.sub("[Oo]", "ɔ", text) text = re.sub("[Öö]", "ɵ", text) text = re.sub("[Üü]", "ʏ", text) text = re.sub("[Îî]", "ǐ", text) # experimentally added text = re.sub("[Ââ]", "ɑ", text) # experimentally added ''' rule 1: if [e], [ɵ], [ʏ], [i] are followed by [l] and [l] is NOT followed by [e], [ɵ], [ʏ], or [i], use [ł] instead of [l] (e.g., [geł], [gełdi], but [gelecek]). ''' text = re.sub(r"([eɵʏǐ])(l)([^eɵʏǐ])", r"\1ł\3", text) ''' rule 2: if the letter "u" [u] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgɣdʒzklłmnprstfhʃʆǮ])", r"w\1", text) ''' rule 3: if the letter "i" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgɣdʒzklłmnprstfhʃʆǮ])", r"i\1", text) return text def ipa_to_turkish(text): # two-sound convenience consonants: text = re.sub("Ǯ", "c", text) text = re.sub("ʆ", "ç", text) # single-sound consonants: text = re.sub("j", "y", text) text = re.sub("ʒ", "j", text) text = re.sub("b", "b", text) text = re.sub("d", "d", text) text = re.sub("f", "f", text) text = re.sub("g", "g", text) text = re.sub("ɣ", "ğ", text) text = re.sub("h", "h", text) text = re.sub("k", "k", text) text = re.sub("l", "l", text) text = re.sub("m", "m", text) text = re.sub("n", "n", text) text = re.sub("p", "p", text) text = re.sub("r", "r", text) text = re.sub("s", "s", text) text = re.sub("ʃ", "ş", text) text = re.sub("t", "t", text) text = re.sub("v", "v", text) text = re.sub("z", "z", text) # single-sound vowels: text = re.sub("ɑ", "a", text) text = re.sub("e", "e", text) text = re.sub("ɤ", "ı", text) text = re.sub("ǐ", "i", text) text = re.sub("ɔ", "o", text) text = re.sub("ɵ", "ö", text) text = re.sub("ʏ", "ü", text) ''' anti-rule 1: ''' text = re.sub(r"([eöüi])(ł)([^eöüi])", r"\1l\3", text) ''' anti-rule 2: the symbol [w] is used in one case only, so we can just replace it for u. ''' text = re.sub(r"w", r"u", text) ''' anti-rule 3: the symbol [i] is used in one case only, so we can just replace it for i. ''' text = re.sub(r"i", r"i", text) return text # testing turkish scripts def test_turkish(text): input_text = text.lower().split() output_text = ipa_to_turkish(turkish_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference) # kyrgyz scripts def kyrgyz_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we can later convert them to conventional symbols. # three-sound convenience vowels: text = re.sub("[Юю]", "ǔ", text) # two-sound convenience consonants: text = re.sub("[Цц]", "š", text) text = re.sub("[Чч]", "ʆ", text) text = re.sub("[Жж]", "Ǯ", text) # two-sound convenience vowels: text = re.sub("[Яя]", "ǎ", text) text = re.sub("[Ее]", "ě", text) text = re.sub("[Ёё]", "ǒ", text) text = re.sub("[Ии]", "ǐ", text) text = re.sub("[Уу]", "u", text) # single-sound consonants: text = re.sub("[Бб]", "b", text) text = re.sub("[Вв]", "v", text) text = re.sub("[Гг]", "g", text) text = re.sub("[Дд]", "d", text) text = re.sub("[Зз]", "z", text) text = re.sub("[Йй]", "j", text) text = re.sub("[Кк]", "k", text) text = re.sub("[Лл]", "l", text) text = re.sub("[Мм]", "m", text) text = re.sub("[Нн]", "n", text) text = re.sub("[Ңң]", "ŋ", text) text = re.sub("[Пп]", "p", text) text = re.sub("[Рр]", "r", text) text = re.sub("[Сс]", "s", text) text = re.sub("[Тт]", "t", text) text = re.sub("[Фф]", "f", text) text = re.sub("[Хх]", "x", text) text = re.sub("[Шш]", "ʃ", text) text = re.sub("[Щщ]", "ɕ", text) text = re.sub("[Ъъ]", "ʔ", text) text = re.sub("[Ьь]", "ʲ", text) # single-sound vowels: text = re.sub("[Аа]", "ɑ", text) text = re.sub("[Оо]", "ɔ", text) text = re.sub("[Өө]", "ɵ", text) text = re.sub("[Үү]", "ʏ", text) text = re.sub("[Ыы]", "ɤ", text) text = re.sub("[Ээ]", "e", text) # rules 1-4 are similar to those for Kazakh: ''' rule 1: if [ě], [ɵ], [ʏ], are followed by [l] and [l] is NOT followed by [ě], [ɵ], [ʏ], or [ʲ], use [ł] instead of [l]. ''' text = re.sub(r"([ɵʏě])(l)([^ɵʏěʲ])", r"\1ł\3", text) ''' rule 2: if the letters "о" and "ө", [ɔ] and [ɵ] at the beginning of a word are followed by [w] (e.g., осы [wɔsɤ] not [ɔsɤ], өзі [wɵzɪ] not [ɵzɪ]). ''' text = re.sub(r"\b([ɔɵ])", r"w\1", text) ''' rule 3 if the letter "у" [u] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgɣdzjkqlłmnŋprstfxhʃɕʔšʆǮʲ])", r"w\1", text) ''' rule 4: if the letter "и" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgɣdzjkqlłmnŋprstfxhʃɕʔšʆǮʲ])", r"i\1", text) # rules 5-6 are specific to Kyrgyz: ''' rule 5 ɑ|ɔ|u|ɤ + k + ɑ|ɔ|u|ɤ ''' text = re.sub(r"([ɑɔwɤ])k", r"\1q", text) text = re.sub(r"k([ɑɔuɤ])", r"q\1", text) ''' rule 6 ɑ|ɔ|u|ɤ + g + ɑ|ɔ|u|ɤ ''' text = re.sub(r"([ɑɔwɤ])g", r"\1ɣ", text) text = re.sub(r"g([ɑɔuɤ])", r"ɣ\1", text) return text def ipa_to_kyrgyz(text): # three-sound convenience vowels: text = re.sub("ǔ", "ю", text) # two-sound convenience consonants: text = re.sub("š", "ц", text) text = re.sub("ʆ", "ч", text) text = re.sub("Ǯ", "ж", text) # two-sound convenience vowels: text = re.sub("ǎ", "я", text) text = re.sub("ě", "е", text) text = re.sub("ǒ", "ё", text) text = re.sub("ǐ", "и", text) text = re.sub("u", "у", text) # single-sound consonants: text = re.sub("b", "б", text) text = re.sub("v", "в", text) text = re.sub("g", "г", text) text = re.sub("ɣ", "г", text) text = re.sub("d", "д", text) text = re.sub("z", "з", text) text = re.sub("j", "й", text) text = re.sub("k", "к", text) text = re.sub("l", "л", text) text = re.sub("m", "м", text) text = re.sub("n", "н", text) text = re.sub("ŋ", "ң", text) text = re.sub("p", "п", text) text = re.sub("q", "к", text) text = re.sub("r", "р", text) text = re.sub("s", "с", text) text = re.sub("t", "т", text) text = re.sub("f", "ф", text) text = re.sub("x", "х", text) text = re.sub("ʃ", "ш", text) text = re.sub("ɕ", "щ", text) text = re.sub("ʔ", "ъ", text) text = re.sub("ʲ", "ь", text) # single-sound vowels: text = re.sub("ɑ", "а", text) text = re.sub("ɔ", "о", text) text = re.sub("ɵ", "ө", text) text = re.sub("ʏ", "ү", text) text = re.sub("ɤ", "ы", text) text = re.sub("e", "э", text) # anti-rules 1-4 are similar to those for Kazakh: ''' anti-rule 1: ''' text = re.sub(r"([өүе])(ł)([^өүеʲ])", r"\1л\3", text) ''' anti-rule 2: ''' text = re.sub(r"\bw([оө])", r"\1", text) ''' anti-rule 3: ''' text = re.sub(r"w([бвгдзйклмнңпрстфхцчшщъьчцж])", r"у\1", text) ''' anti-rule 4: ''' text = re.sub(r"i([бвгдзйклмнңпрстфхцчшщъьчцж])", r"и\1", text) return text # testing kyrgyz scripts def test_kyrgyz(text): input_text = text.lower().split() output_text = ipa_to_kyrgyz(kyrgyz_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference) # uzbek scripts def uzbek_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we can later convert them to conventional symbols. # two-sound convenience consonants: text = re.sub("[Jj]", "Ǯ", text) text = re.sub("Ch", "ʆ", text) text = re.sub("ch", "ʆ", text) # two-sound convenience vowels: text = re.sub("[Ii]", "ǐ", text) text = re.sub("[Uu]", "u", text) # single-sound consonants: text = re.sub("[Bb]", "b", text) text = re.sub("[Dd]", "d", text) text = re.sub("[Ff]", "f", text) text = re.sub("G‘", "ɣ", text) text = re.sub("g‘", "ɣ", text) text = re.sub("[Gg]", "g", text) text = re.sub("[Hh]", "h", text) text = re.sub("[Kk]", "k", text) text = re.sub("[Ll]", "l", text) text = re.sub("[Mm]", "m", text) text = re.sub("[Nn]", "n", text) text = re.sub("Ng", "ŋ", text) text = re.sub("ng", "ŋ", text) text = re.sub("[Pp]", "p", text) text = re.sub("[Qq]", "q", text) text = re.sub("[Rr]", "r", text) text = re.sub("[Ss]", "s", text) text = re.sub("Sh", "ʃ", text) text = re.sub("sh", "ʃ", text) text = re.sub("[Tt]", "t", text) text = re.sub("[Vv]", "v", text) text = re.sub("[Xx]", "x", text) text = re.sub("[Yy]", "j", text) text = re.sub("[Zz]", "z", text) # single-sound vowels: text = re.sub("[Aa]", "æ", text) text = re.sub("[Ee]", "e", text) text = re.sub("Oʻ", "ɵ", text) text = re.sub("oʻ", "ɵ", text) text = re.sub("[Oo]", "ɔ", text) # hard sign text = re.sub("'", "ʔ", text) ''' rule 1: if [æ], [e], [ɵ], [ǐ] are followed by [l] and [l] is NOT followed by [æ], [e], [ɵ], [ǐ], use [ł] instead of [l]. ''' text = re.sub(r"([æɵǐe])(l)([^æɵǐe])", r"\1ł\3", text) ''' rule 2: if the letter "u" [u] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgɣdzjkqlłmnŋprstfxhʃʔʆǮ])", r"w\1", text) ''' rule 3: if the letter "i" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgɣdzjkqlłmnŋprstfxhʃʔʆǮ])", r"i\1", text) return text def ipa_to_uzbek(text): # two-sound convenience consonants: text = re.sub("j", "y", text) # exception! precedence issue text = re.sub("Ǯ", "j", text) text = re.sub("ʆ", "ch", text) # two-sound convenience vowels: text = re.sub("ǐ", "i", text) text = re.sub("u", "u", text) # single-sound convenience consonants: text = re.sub("b", "b", text) text = re.sub("d", "d", text) text = re.sub("f", "f", text) text = re.sub("g", "g", text) text = re.sub("ɣ", "g‘", text) text = re.sub("h", "h", text) text = re.sub("k", "k", text) text = re.sub("l", "l", text) text = re.sub("m", "m", text) text = re.sub("n", "n", text) text = re.sub("ŋ", "ng", text) text = re.sub("p", "p", text) text = re.sub("q", "q", text) text = re.sub("r", "r", text) text = re.sub("s", "s", text) text = re.sub("ʃ", "sh", text) text = re.sub("t", "t", text) text = re.sub("v", "v", text) text = re.sub("x", "x", text) text = re.sub("z", "z", text) # single-sound convenience vowels: text = re.sub("æ", "a", text) text = re.sub("e", "e", text) text = re.sub("ɵ", "o‘", text) text = re.sub("ɔ", "o", text) # hard sign text = re.sub("ʔ", "'", text) ''' anti-rule 1: ''' text = re.sub(r"([aei‘])(ł)([^aei‘])", r"\1l\3", text) ''' anti-rule 2: ''' text = re.sub(r"w([bcvgɣdjzklmnpqrstfhyx])", r"u\1", text) ''' anti-rule 3: ''' text = re.sub(r"i([bcvgɣdjzklmnpqrstfhyx])", r"i\1", text) return text # testing uzbek scripts def test_uzbek(text): input_text = text.lower().split() output_text = ipa_to_uzbek(uzbek_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference) # azerbaijani scripts def azerbaijani_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we can later convert them to conventional symbols. # two-sound convenience consonants: text = re.sub("[Cc]", "Ǯ", text) text = re.sub("[Çç]", "ʆ", text) text = re.sub("[Gg]", "ḡ", text) # two-sound convenience vowels: text = re.sub("[İi]", "ǐ", text) text = re.sub("[Uu]", "u", text) # single-sound consonants: text = re.sub("[Jj]", "ʒ", text) text = re.sub("[Yy]", "j", text) text = re.sub("[Bb]", "b", text) text = re.sub("[Dd]", "d", text) text = re.sub("[Ff]", "f", text) text = re.sub("[Ğğ]", "ɣ", text) text = re.sub("[Hh]", "h", text) text = re.sub("[Xx]", "x", text) text = re.sub("[Kk]", "k", text) text = re.sub("[Qq]", "g", text) text = re.sub("[Ll]", "l", text) text = re.sub("[Mm]", "m", text) text = re.sub("[Nn]", "n", text) text = re.sub("[Pp]", "p", text) text = re.sub("[Rr]", "r", text) text = re.sub("[Ss]", "s", text) text = re.sub("[Şş]", "ʃ", text) text = re.sub("[Tt]", "t", text) text = re.sub("[Vv]", "v", text) text = re.sub("[Zz]", "z", text) # single-sound vowels: text = re.sub("[Aa]", "ɑ", text) text = re.sub("[Ee]", "e", text) text = re.sub("[Əə]", "æ", text) text = re.sub("[Iı]", "ɤ", text) text = re.sub("[Oo]", "ɔ", text) text = re.sub("[Öö]", "ɵ", text) text = re.sub("[Üü]", "ʏ", text) ''' rule 1: if [æ], [e], [ɵ], [ʏ], [i] are followed by [l] and [l] is NOT followed by [æ], [e], [ɵ], [ʏ], or [i], use [ł] instead of [l]. ''' text = re.sub(r"([æeɵʏǐ])(l)([^æeɵʏǐ])", r"\1ł\3", text) ''' rule 2: if the letter "u" [ʊw] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgḡɣdʒzklłmnprstfhxʃʆǮ])", r"w\1", text) ''' rule 3: if the letter "i" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgḡɣdʒzklłmnprstfhxʃʆǮ])", r"i\1", text) return text def ipa_to_azerbaijani(text): # two-sound convenience consonants: text = re.sub("Ǯ", "c", text) text = re.sub("ʆ", "ç", text) text = re.sub("g", "q", text) # precedence issue text = re.sub("ḡ", "g", text) # single-sound consonants: text = re.sub("j", "y", text) text = re.sub("ʒ", "j", text) text = re.sub("b", "b", text) text = re.sub("d", "d", text) text = re.sub("f", "f", text) text = re.sub("ɣ", "ğ", text) text = re.sub("h", "h", text) text = re.sub("x", "x", text) text = re.sub("k", "k", text) text = re.sub("l", "l", text) text = re.sub("m", "m", text) text = re.sub("n", "n", text) text = re.sub("p", "p", text) text = re.sub("r", "r", text) text = re.sub("s", "s", text) text = re.sub("ʃ", "ş", text) text = re.sub("t", "t", text) text = re.sub("v", "v", text) text = re.sub("z", "z", text) # single-sound vowels: text = re.sub("ɑ", "a", text) text = re.sub("e", "e", text) text = re.sub("æ", "ə", text) text = re.sub("ɤ", "ı", text) text = re.sub("ǐ", "i", text) text = re.sub("ɔ", "o", text) text = re.sub("ɵ", "ö", text) text = re.sub("ʏ", "ü", text) ''' anti-rule 1: ''' text = re.sub(r"([əeöüiě])(ł)([^əeöüiě])", r"\1l\3", text) ''' anti-rule 2: ''' text = re.sub(r"w([bvgğdjzkqlmnprstfhxşçc])", r"u\1", text) ''' anti-rule 3: ''' text = re.sub(r"i([bcvgğdjzkqlmnprstfhxşç])", r"i\1", text) return text # testing azerbaijani scripts def test_azerbaijani(text): input_text = text.lower().split() output_text = ipa_to_azerbaijani(azerbaijani_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference) # turkmen scripts def turkmen_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we can later convert them to conventional symbols. # two-sound convenience consonants: text = re.sub("[Çç]", "ʆ", text) text = re.sub("[Jj]", "Ǯ", text) # two-sound convenience vowels: text = re.sub("[İi]", "ǐ", text) text = re.sub("[Uu]", "u", text) # single-sound consonants: text = re.sub("[Bb]", "b", text) text = re.sub("[Dd]", "d", text) text = re.sub("[Ff]", "f", text) text = re.sub("[Gg]", "g", text) text = re.sub("[Hh]", "h", text) text = re.sub("[Žž]", "ʒ", text) text = re.sub("[Kk]", "k", text) text = re.sub("[Ll]", "l", text) text = re.sub("[Mm]", "m", text) text = re.sub("[Nn]", "n", text) text = re.sub("[Ňň]", "ŋ", text) text = re.sub("[Pp]", "p", text) text = re.sub("[Rr]", "r", text) text = re.sub("[Ss]", "s", text) # θ text = re.sub("[Şş]", "ʃ", text) text = re.sub("[Tt]", "t", text) text = re.sub("[Ww]", "v", text) text = re.sub("[Ýý]", "j", text) text = re.sub("[Zz]", "z", text) # ð # single-sound vowels: text = re.sub("[Aa]", "ɑ", text) text = re.sub("[Ää]", "æ", text) text = re.sub("[Ee]", "e", text) text = re.sub("[Oo]", "ɔ", text) text = re.sub("[Öö]", "ɵ", text) text = re.sub("[Üü]", "ʏ", text) text = re.sub("[Yy]", "ɤ", text) # rules: ''' rule 1: if [æ], [e], [ɵ], [ʏ], [i] are followed by [l] and [l] is NOT followed by [æ], [e], [ɵ], [ʏ], or [i], use [ł] instead of [l]. ''' text = re.sub(r"([æeɵʏǐ])(l)([^æeɵʏǐ])", r"\1ł\3", text) ''' rule 2: if the letter "u" [ʊw] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgɣqdʒzkqlłmnprstfhʃʆǮw])", r"w\1", text) ''' rule 3: if the letter "i" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgɣqdʒzkqlłmnprstfhʃʆǮ])", r"i\1", text) # rules 4-5 are specific to Turkmen: ''' rule 4: a, o, u, y + k + a, o, u, y: ''' text = re.sub(r"k([ɑɔuɤ])", r"q\1", text) text = re.sub(r"([ɑɔwɤ])k", r"\1q", text) ''' rule 5: a, o, u, y + g + a, o, u, y: ''' text = re.sub(r"g([ɑɔuɤ])", r"ɣ\1", text) text = re.sub(r"([ɑɔwɤ])g", r"\1ɣ", text) return text def ipa_to_turkmen(text): # two-sound convenience consonants: text = re.sub("j", "ý", text) # precedence issue text = re.sub("Ǯ", "j", text) text = re.sub("ʆ", "ç", text) # single-sound consonants: # w --> v can be found where the letter u anti-rule is text = re.sub("b", "b", text) text = re.sub("d", "d", text) text = re.sub("f", "f", text) text = re.sub("g", "g", text) text = re.sub("ɣ", "g", text) text = re.sub("h", "h", text) text = re.sub("ʒ", "ž", text) text = re.sub("k", "k", text) text = re.sub("q", "k", text) text = re.sub("l", "l", text) text = re.sub("m", "m", text) text = re.sub("n", "n", text) text = re.sub("ŋ", "ň", text) text = re.sub("p", "p", text) text = re.sub("r", "r", text) text = re.sub("s", "s", text) text = re.sub("ʃ", "ş", text) text = re.sub("t", "t", text) text = re.sub("z", "z", text) # single-sound vowels: text = re.sub("ɑ", "a", text) text = re.sub("e", "e", text) text = re.sub("æ", "ä", text) text = re.sub("ǐ", "i", text) text = re.sub("ɔ", "o", text) text = re.sub("ɵ", "ö", text) text = re.sub("ʏ", "ü", text) text = re.sub("ɤ", "y", text) # anti-rules: ''' anti-rule 1: ''' text = re.sub(r"([äeöüiě])(ł)([^äeöüiě])", r"\1l\3", text) ''' anti-rule 2: ''' text = re.sub(r"w([bdfghžklmnňprsştýzjçɣqv])", r"u\1", text) # precedence issue text = re.sub("v", "w", text) # precedence issue ''' anti-rule 3: ''' text = re.sub(r"i([bdfghžklmnňprsştwýzjçɣq])", r"i\1", text) return text # testing turkmen scripts def test_turkmen(text): input_text = text.lower().split() output_text = ipa_to_turkmen(turkmen_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference) # tatar scripts def tatar_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we can later convert them to conventional symbols. # three-sound convenience vowels: text = re.sub("[Юю]", "ǔ", text) # two-sound convenience consonants: text = re.sub("[Цц]", "š", text) text = re.sub("[Чч]", "ʆ", text) text = re.sub("[Җҗ]", "Ǯ", text) # two-sound convenience vowels: text = re.sub("[Яя]", "ǎ", text) text = re.sub("[Ее]", "ě", text) text = re.sub("[Ёё]", "ǒ", text) text = re.sub("[Ии]", "ǐ", text) text = re.sub("[Уу]", "u", text) # single-sound consonants: text = re.sub("[Бб]", "b", text) text = re.sub("[Вв]", "v", text) text = re.sub("[Гг]", "g", text) text = re.sub("[Дд]", "d", text) text = re.sub("[Жж]", "ʒ", text) text = re.sub("[Зз]", "z", text) text = re.sub("[Йй]", "j", text) text = re.sub("[Кк]", "k", text) text = re.sub("[Лл]", "l", text) text = re.sub("[Мм]", "m", text) text = re.sub("[Нн]", "n", text) text = re.sub("[Ңң]", "ŋ", text) text = re.sub("[Пп]", "p", text) text = re.sub("[Рр]", "r", text) text = re.sub("[Сс]", "s", text) text = re.sub("[Тт]", "t", text) text = re.sub("[Фф]", "f", text) text = re.sub("[Хх]", "x", text) text = re.sub("[Һһ]", "h", text) text = re.sub("[Шш]", "ʃ", text) text = re.sub("[Щщ]", "ɕ", text) text = re.sub("[Ъъ]", "ʔ", text) text = re.sub("[Ьь]", "ʲ", text) # single-sound vowels: text = re.sub("[Аа]", "ɑ", text) text = re.sub("[Әә]", "æ", text) text = re.sub("[Оо]", "ɔ", text) text = re.sub("[Өө]", "ɵ", text) text = re.sub("[Үү]", "ʏ", text) text = re.sub("[Ыы]", "ɤ", text) text = re.sub("[Ээ]", "e", text) # rules 1-4 are similar to those for Kazakh: ''' rule 1: if [ě], [ɵ], [ʏ], are followed by [l] and [l] is NOT followed by [ě], [ɵ], [ʏ], or [ʲ], use [ł] instead of [l]. ''' text = re.sub(r"([æɵʏě])(l)([^æɵʏěʲ])", r"\1ł\3", text) ''' rule 2: if the letters "о" and "ө", [ɔ] and [ɵ] at the beginning of a word are followed by [w] (e.g., осы [wɔsɤ] not [ɔsɤ], өзі [wɵzɪ] not [ɵzɪ]). ''' text = re.sub(r"\b([ɔɵ])", r"w\1", text) ''' rule 3 if the letter "у" [u] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆǮʲ])", r"w\1", text) ''' rule 4: if the letter "и" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆǮʲ])", r"i\1", text) # rules 5-6 are specific to Tatar: ''' rule 5: а, о, у, ы, ъ + к + а, о, у, ы, ъ ''' text = re.sub(r"k([ɑɔwɤʔ])", r"q\1", text) text = re.sub(r"([ɑɔwɤʔ])k", r"\1q", text) ''' rule 6: а, о, у, ы, ъ + г + а, о, у, ы, ъ ''' text = re.sub(r"g([ɑɔwɤʔ])", r"ɣ\1", text) text = re.sub(r"([ɑɔwɤʔ])g", r"\1ɣ", text) return text def ipa_to_tatar(text): # three-sound convenience vowels: text = re.sub("ǔ", "ю", text) # two-sound convenience consonants: text = re.sub("š", "ц", text) text = re.sub("ʆ", "ч", text) text = re.sub("Ǯ", "җ", text) # two-sound convenience vowels: text = re.sub("ǎ", "я", text) text = re.sub("ě", "е", text) text = re.sub("ǒ", "ё", text) text = re.sub("ǐ", "и", text) text = re.sub("u", "у", text) # single-sound consonants: text = re.sub("b", "б", text) text = re.sub("v", "в", text) text = re.sub("g", "г", text) text = re.sub("ɣ", "г", text) text = re.sub("d", "д", text) text = re.sub("ʒ", "ж", text) text = re.sub("z", "з", text) text = re.sub("j", "й", text) text = re.sub("k", "к", text) text = re.sub("l", "л", text) text = re.sub("m", "м", text) text = re.sub("n", "н", text) text = re.sub("ŋ", "ң", text) text = re.sub("p", "п", text) text = re.sub("q", "к", text) text = re.sub("r", "р", text) text = re.sub("s", "с", text) text = re.sub("t", "т", text) text = re.sub("f", "ф", text) text = re.sub("x", "х", text) text = re.sub("h", "һ", text) text = re.sub("ʃ", "ш", text) text = re.sub("ɕ", "щ", text) text = re.sub("ʔ", "ъ", text) text = re.sub("ʲ", "ь", text) # single-sound vowels: text = re.sub("ɑ", "а", text) text = re.sub("æ", "ә", text) text = re.sub("ɔ", "о", text) text = re.sub("ɵ", "ө", text) text = re.sub("ʏ", "ү", text) text = re.sub("ɤ", "ы", text) text = re.sub("e", "э", text) # anti-rules 1-4 are similar to those for Kazakh: ''' anti-rule 1: ''' text = re.sub(r"([әөүе])(ł)([^әөүеʲ])", r"\1л\3", text) ''' anti-rule 2: ''' text = re.sub(r"\bw([оө])", r"\1", text) ''' anti-rule 3: ''' text = re.sub(r"w([бвгдзйклмнңпрстфхһцчшщъьчцжҗqɣ])", r"у\1", text) ''' anti-rule 4: ''' text = re.sub(r"i([бвгдзйклмнңпрстфхһцчшщъьчцжҗqɣ])", r"и\1", text) return text # testing tatar scripts def test_tatar(text): input_text = text.lower().split() output_text = ipa_to_tatar(tatar_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference) # bashkir scripts def bashkir_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we shall later convert them to conventional symbols. # three-sound convenience vowels: text = re.sub("[Юю]", "ǔ", text) # two-sound convenience consonants: text = re.sub("[Цц]", "š", text) text = re.sub("[Чч]", "ʆ", text) # two-sound convenience vowels: text = re.sub("[Яя]", "ǎ", text) text = re.sub("[Ее]", "ě", text) text = re.sub("[Ёё]", "ǒ", text) text = re.sub("[Ии]", "ǐ", text) text = re.sub("[Уу]", "u", text) # single-sound consonants: text = re.sub("[Бб]", "b", text) text = re.sub("[Вв]", "v", text) text = re.sub("[Гг]", "g", text) text = re.sub("[Ғғ]", "ɣ", text) text = re.sub("[Дд]", "d", text) text = re.sub("[Ҙҙ]", "z", text) text = re.sub("[Жж]", "ʒ", text) text = re.sub("[Зз]", "z", text) text = re.sub("[Йй]", "j", text) text = re.sub("[Кк]", "k", text) text = re.sub("[Ҡҡ]", "q", text) text = re.sub("[Лл]", "l", text) text = re.sub("[Мм]", "m", text) text = re.sub("[Нн]", "n", text) text = re.sub("[Ңң]", "ŋ", text) text = re.sub("[Пп]", "p", text) text = re.sub("[Рр]", "r", text) text = re.sub("[Сс]", "s", text) text = re.sub("[Ҫҫ]", "s", text) text = re.sub("[Тт]", "t", text) text = re.sub("[Хх]", "x", text) text = re.sub("[Фф]", "f", text) text = re.sub("[Һһ]", "h", text) text = re.sub("[Шш]", "ʃ", text) text = re.sub("[Щщ]", "ɕ", text) text = re.sub("[Ъъ]", "ʔ", text) text = re.sub("[Ьь]", "ʲ", text) # single-sound vowels: text = re.sub("[Аа]", "ɑ", text) text = re.sub("[Әә]", "æ", text) text = re.sub("[Оо]", "ɔ", text) text = re.sub("[Өө]", "ɵ", text) text = re.sub("[Үү]", "ʏ", text) text = re.sub("[Ыы]", "ɤ", text) text = re.sub("[Ээ]", "e", text) # rules 1-4 are similar to those for Kazakh: ''' rule 1: if [ě], [ɵ], [ʏ], are followed by [l] and [l] is NOT followed by [ě], [ɵ], [ʏ], or [ʲ], use [ł] instead of [l]. ''' text = re.sub(r"([æɵʏě])(l)([^æɵʏěʲ])", r"\1ł\3", text) ''' rule 2: if the letters "о" and "ө", [ɔ] and [ɵ] at the beginning of a word are followed by [w] (e.g., осы [wɔsɤ] not [ɔsɤ], өзі [wɵzɪ] not [ɵzɪ]). ''' text = re.sub(r"\b([ɔɵ])", r"w\1", text) ''' rule 3 if the letter "у" [u] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆʲ])", r"w\1", text) ''' rule 4: if the letter "и" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgɣdʒzjkqlłmnŋprstfxhʃɕʔšʆʲ])", r"i\1", text) return text def ipa_to_bashkir(text): # three-sound convenience vowels: text = re.sub("ǔ", "ю", text) # two-sound convenience consonants: text = re.sub("š", "ц", text) text = re.sub("ʆ", "ч", text) # two-sound convenience vowels: text = re.sub("ě", "е", text) text = re.sub("ǒ", "ё", text) text = re.sub("ǐ", "и", text) text = re.sub("u", "у", text) text = re.sub("ǎ", "я", text) # single-sound consonants: text = re.sub("b", "б", text) text = re.sub("v", "в", text) text = re.sub("g", "г", text) text = re.sub("ɣ", "ғ", text) text = re.sub("d", "д", text) text = re.sub("z", "з", text) text = re.sub("ʒ", "ж", text) text = re.sub("j", "й", text) text = re.sub("k", "к", text) text = re.sub("q", "ҡ", text) text = re.sub("l", "л", text) text = re.sub("m", "м", text) text = re.sub("n", "н", text) text = re.sub("ŋ", "ң", text) text = re.sub("p", "п", text) text = re.sub("r", "р", text) text = re.sub("s", "с", text) text = re.sub("t", "т", text) text = re.sub("f", "ф", text) text = re.sub("x", "х", text) text = re.sub("h", "һ", text) text = re.sub("ʃ", "ш", text) text = re.sub("ɕ", "щ", text) text = re.sub("ʔ", "ъ", text) text = re.sub("ʲ", "ь", text) # single-sound vowels: text = re.sub("ɑ", "а", text) text = re.sub("æ", "ә", text) text = re.sub("ɔ", "о", text) text = re.sub("ɵ", "ө", text) text = re.sub("ʏ", "ү", text) text = re.sub("ɤ", "ы", text) text = re.sub("e", "э", text) # anti-rules 1-4 are similar to those for Kazakh: ''' anti-rule 1: ''' text = re.sub(r"([әөүе])(ł)([^әөүеʲ])", r"\1л\3", text) ''' anti-rule 2: ''' text = re.sub(r"\bw([оө])", r"\1", text) ''' anti-rule 3: ''' text = re.sub(r"w([бвгғдзйкҡлмнңпрстфхһцчшщъьчцж])", r"у\1", text) ''' anti-rule 4: ''' text = re.sub(r"i([бвгғдзйкҡлмнңпрстфхһцчшщъьчцж])", r"и\1", text) return text # testing bashkir scripts def test_bashkir(text): input_text = text.lower().split() output_text = ipa_to_bashkir(bashkir_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference) # sakha scripts def sakha_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we can later convert them to conventional symbols. # three-sound convenience vowels: text = re.sub("[Юю]", "ǔ", text) # two-sound convenience consonants: text = re.sub("[Цц]", "š", text) text = re.sub("[Чч]", "ʆ", text) text = re.sub("ДЬ", "Ǯ", text) text = re.sub("дь", "Ǯ", text) text = re.sub("Дь", "Ǯ", text) text = re.sub("дЬ", "Ǯ", text) text = re.sub("НЬ", "ɲ", text) text = re.sub("нь", "ɲ", text) text = re.sub("Нь", "ɲ", text) text = re.sub("нЬ", "ɲ", text) # two-sound convenience vowels: text = re.sub("[Яя]", "ǎ", text) text = re.sub("[Ее]", "ě", text) text = re.sub("[Ёё]", "ǒ", text) text = re.sub("[Ии]", "ǐ", text) text = re.sub("[Уу]", "u", text) # single-sound consonants: text = re.sub("[Бб]", "b", text) text = re.sub("[Вв]", "v", text) text = re.sub("[Гг]", "g", text) text = re.sub("[Ҕҕ]", "ɣ", text) text = re.sub("[Дд]", "d", text) text = re.sub("[Жж]", "ʒ", text) text = re.sub("[Зз]", "z", text) text = re.sub("[Йй]", "j", text) text = re.sub("[Кк]", "k", text) text = re.sub("[Лл]", "l", text) text = re.sub("[Мм]", "m", text) text = re.sub("[Нн]", "n", text) text = re.sub("[Ҥҥ]", "ŋ", text) text = re.sub("[Пп]", "p", text) text = re.sub("[Рр]", "r", text) text = re.sub("[Сс]", "s", text) text = re.sub("[Тт]", "t", text) text = re.sub("[Хх]", "x", text) text = re.sub("[Фф]", "f", text) text = re.sub("[Һһ]", "h", text) text = re.sub("[Шш]", "ʃ", text) text = re.sub("[Щщ]", "ɕ", text) text = re.sub("[Ъъ]", "ʔ", text) text = re.sub("[Ьь]", "ʲ", text) # single-sound vowels: text = re.sub("[Аа]", "ɑ", text) text = re.sub("[Әә]", "æ", text) text = re.sub("[Оо]", "ɔ", text) text = re.sub("[Өө]", "ɵ", text) text = re.sub("[Үү]", "ʏ", text) text = re.sub("[Ыы]", "ɤ", text) text = re.sub("[Ээ]", "e", text) # rules 1-4 are similar to those for Kazakh: ''' rule 1: if [ě], [ɵ], [ʏ], are followed by [l] and [l] is NOT followed by [ě], [ɵ], [ʏ], or [ʲ], use [ł] instead of [l]. ''' text = re.sub(r"([æɵʏě])(l)([^æɵʏěʲ])", r"\1ł\3", text) ''' rule 2: if the letters "о" and "ө", [ɔ] and [ɵ] at the beginning of a word are followed by [w] (e.g., осы [wɔsɤ] not [ɔsɤ], өзі [wɵzɪ] not [ɵzɪ]). ''' text = re.sub(r"\b([ɔɵ])", r"w\1", text) ''' rule 3 if the letter "у" [u] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgɣdʒzjklłmnŋɲprstfxhʃɕʔšʆǮʲ])", r"w\1", text) ''' rule 4: if the letter "и" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgɣdʒzjklłmnŋɲprstfxhʃɕʔšʆǮʲ])", r"i\1", text) return text def ipa_to_sakha(text): # three-sound convenience vowels: text = re.sub("ǔ", "ю", text) # two-sound convenience consonants: text = re.sub("š", "ц", text) text = re.sub("ʆ", "ч", text) text = re.sub("Ǯ", "дь", text) text = re.sub("ɲ", "нь", text) # two-sound convenience vowels: text = re.sub("ě", "е", text) text = re.sub("ǒ", "ё", text) text = re.sub("ǐ", "и", text) text = re.sub("u", "у", text) text = re.sub("ǎ", "я", text) # single-sound consonants: text = re.sub("b", "б", text) text = re.sub("v", "в", text) text = re.sub("g", "г", text) text = re.sub("ɣ", "ҕ", text) text = re.sub("d", "д", text) text = re.sub("z", "з", text) text = re.sub("ʒ", "ж", text) text = re.sub("j", "й", text) text = re.sub("k", "к", text) text = re.sub("l", "л", text) text = re.sub("m", "м", text) text = re.sub("n", "н", text) text = re.sub("ŋ", "ҥ", text) text = re.sub("p", "п", text) text = re.sub("r", "р", text) text = re.sub("s", "с", text) text = re.sub("t", "т", text) text = re.sub("f", "ф", text) text = re.sub("x", "х", text) text = re.sub("h", "һ", text) text = re.sub("ʃ", "ш", text) text = re.sub("ɕ", "щ", text) text = re.sub("ʔ", "ъ", text) text = re.sub("ʲ", "ь", text) # single-sound vowels: text = re.sub("ɑ", "а", text) text = re.sub("æ", "ә", text) text = re.sub("ɔ", "о", text) text = re.sub("ɵ", "ө", text) text = re.sub("ʏ", "ү", text) text = re.sub("ɤ", "ы", text) text = re.sub("e", "э", text) # anti-rules 1-4 are similar to those for Kazakh: ''' anti-rule 1: ''' text = re.sub(r"([әөүе])(ł)([^әөүеʲ])", r"\1л\3", text) ''' anti-rule 2: ''' text = re.sub(r"\bw([оө])", r"\1", text) ''' anti-rule 3: ''' text = re.sub(r"w(дь)", r"у\1", text) text = re.sub(r"w(нь)", r"у\1", text) text = re.sub(r"w([бвгҕдзйклмнҥпрстфхһцчшщъьчцж])", r"у\1", text) ''' anti-rule 4: ''' text = re.sub(r"i(дь)", r"и\1", text) text = re.sub(r"i(нь)", r"и\1", text) text = re.sub(r"i([бвгҕдзйклмнҥпрстфхһцчшщъьчцж])", r"и\1", text) return text # testing sakha scripts # testing bashkir scripts def test_sakha(text): input_text = text.lower().split() output_text = ipa_to_sakha(sakha_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference) # uyghur scripts def uyghur_to_ipa(text): # we shall begin with sound combinations: # the longer a combination, the upper it is on the list # single sounds should go to the bottom, with consonants taking precedence over vowels. # consonants are less likely to change than vowels. # for convenience, we shall use single symbols to denote multiple-sound combinations. # we can later convert them to conventional symbols. # two-sound convenience consonants: text = re.sub("[Jj]", "Ǯ", text) text = re.sub("Ch", "ʆ", text) text = re.sub("ch", "ʆ", text) # two-sound convenience vowels: text = re.sub("[Ii]", "ǐ", text) text = re.sub("[Uu]", "u", text) # single-sound consonants: text = re.sub("[Bb]", "b", text) text = re.sub("[Dd]", "d", text) text = re.sub("[Ff]", "f", text) text = re.sub("Gh", "ɣ", text) text = re.sub("gh", "ɣ", text) text = re.sub("[Gg]", "g", text) text = re.sub("[Hh]", "h", text) text = re.sub("[Kk]", "k", text) text = re.sub("[Ll]", "l", text) text = re.sub("[Mm]", "m", text) text = re.sub("[Nn]", "n", text) text = re.sub("Ng", "ŋ", text) text = re.sub("ng", "ŋ", text) text = re.sub("[Pp]", "p", text) text = re.sub("[Qq]", "q", text) text = re.sub("[Rr]", "r", text) text = re.sub("[Ss]", "s", text) text = re.sub("Sh", "ʃ", text) text = re.sub("sh", "ʃ", text) text = re.sub("[Tt]", "t", text) text = re.sub("[Ww]", "v", text) text = re.sub("[Xx]", "x", text) text = re.sub("[Yy]", "j", text) text = re.sub("[Zz]", "z", text) text = re.sub("Zh", "ʒ", text) text = re.sub("zh", "ʒ", text) # single-sound vowels: text = re.sub("[Aa]", "ɑ", text) text = re.sub("[Ee]", "æ", text) text = re.sub("[ËÉëé]", "e", text) text = re.sub("[Oo]", "ɔ", text) text = re.sub("[Öö]", "ɵ", text) text = re.sub("[Üü]", "ʏ", text) # hard sign text = re.sub("'", "ʔ", text) ''' rule 1: if [æ], [e], [ɵ], [ǐ] are followed by [l] and [l] is NOT followed by [æ], [e], [ɵ], [ǐ], use [ł] instead of [l]. ''' text = re.sub(r"([æɵǐeʏ])(l)([^æɵǐeʏ])", r"\1ł\3", text) ''' rule 2: if the letter "u" [u] is followed by consonants, use [w] instead of [u]. ''' text = re.sub(r"u([bvgɣdzjkqlłmnŋprstfxhʃʆǮʒ])", r"w\1", text) ''' rule 3: if the letter "i" [ǐ] is followed by consonants, use [i] instead of [ǐ]. ''' text = re.sub(r"ǐ([bvgɣdzjkqlłmnŋprstfxhʃʆǮʒ])", r"i\1", text) return text def ipa_to_uyghur(text): # two-sound convenience consonants: text = re.sub("j", "y", text) # exception! precedence issue text = re.sub("Ǯ", "j", text) text = re.sub("ʆ", "ch", text) text = re.sub("ʒ", "zh", text) # two-sound convenience vowels: text = re.sub("ǐ", "i", text) text = re.sub("u", "u", text) # single-sound convenience consonants: text = re.sub("b", "b", text) text = re.sub("d", "d", text) text = re.sub("f", "f", text) text = re.sub("g", "g", text) text = re.sub("ɣ", "gh", text) text = re.sub("h", "h", text) text = re.sub("k", "k", text) text = re.sub("l", "l", text) text = re.sub("m", "m", text) text = re.sub("n", "n", text) text = re.sub("ŋ", "ng", text) text = re.sub("p", "p", text) text = re.sub("q", "q", text) text = re.sub("r", "r", text) text = re.sub("s", "s", text) text = re.sub("ʃ", "sh", text) text = re.sub("t", "t", text) text = re.sub("v", "w", text) text = re.sub("x", "x", text) text = re.sub("z", "z", text) # single-sound convenience vowels: text = re.sub("ɑ", "a", text) text = re.sub("e", "ë", text) # precedence text = re.sub("æ", "e", text) text = re.sub("ɵ", "ö", text) text = re.sub("ɔ", "o", text) text = re.sub("ʏ", "ü", text) # hard sign text = re.sub("ʔ", "'", text) ''' anti-rule 1: ''' text = re.sub(r"([eëiöü])(ł)([^eëiöü])", r"\1l\3", text) ''' anti-rule 2: ''' text = re.sub(r"w([bcvgdjzklmnpqrstfhyx])", r"u\1", text) ''' anti-rule 3: ''' text = re.sub(r"i([bcvgdjzklmnpqrstfhyx])", r"i\1", text) return text # testing uyghur scripts def test_uyghur(text): input_text = text.lower().split() output_text = ipa_to_uyghur(uyghur_to_ipa(text)).split() input_difference = [] output_difference = [] for item in input_text: if item not in output_text: input_difference.append(item) for item in output_text: if item not in input_text: output_difference.append(item) if input_text == output_text: print("input text and output text -- identical") else: print("input text and output text -- different") print("input:", input_difference) print("output:", output_difference)