File size: 1,651 Bytes
6cfbfef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# Latin "c"/"C" are first swapped for the Cyrillic (Serbian) letters "ј"/"Ј".
# These stand-ins are turned into Latin "j"/"J" at a later step, via
# cyrillic_equiv_dict.
# NOTE(review): presumably a stand-in is used so the per-letter pass, which
# rewrites Latin "j" to "zh", leaves these characters alone - confirm.
special_combs = {
    "c": "ј",
    "C": "Ј",
}
# Per-letter replacement table consulted by turkish_letter_to_eng; any
# character that is not a key here is passed through unchanged.
# Upper-case keys map to capitalised replacements so that a capital letter
# in the input stays capitalised in the output.
turkish_dict = {
    "ç": "ch", "Ç": "Ch",
    "ğ": "'", "Ğ": "'",
    "ı": "e", "I": "E",
    "i": "i", "İ": "I",
    # "J" yields "Zh" (not "zh") to match the other digraph pairs
    # such as "Ch", "Sh" and "Yu".
    "j": "zh", "J": "Zh",
    "ö": "o", "Ö": "O",
    "ş": "sh", "Ş": "Sh",
    "ü": "yu", "Ü": "Yu",
    "w": "v", "W": "V",
}
# Cyrillic stand-in characters and the Latin letters that replace them in
# cyrillic_to_eng.
cyrillic_equiv_dict = {
    "ј": "j",
    "Ј": "J",
    "ў": "w",
}
def check_special_comb(word):
    """Replace every key of special_combs found in word with its mapped value.

    Args:
        word: The text to scan.

    Returns:
        word with each occurring key substituted; word is returned as-is
        when none of the keys occur in it.
    """
    for source, stand_in in special_combs.items():
        if source not in word:
            continue
        word = word.replace(source, stand_in)
    return word
def cyrillic_to_eng(word):
    """Swap the Cyrillic characters listed in cyrillic_equiv_dict for Latin ones.

    Args:
        word: The text to scan.

    Returns:
        word with every key of cyrillic_equiv_dict replaced by its value.
    """
    result = word
    for cyrillic_char, latin_char in cyrillic_equiv_dict.items():
        if cyrillic_char in result:
            result = result.replace(cyrillic_char, latin_char)
    return result
def turkish_letter_to_eng(letter):
    """Return the turkish_dict replacement for letter, or letter itself.

    Args:
        letter: The character to look up.

    Returns:
        The mapped replacement string when letter is a key of turkish_dict;
        otherwise letter unchanged.
    """
    # Falling back to the input keeps unmapped characters intact.
    return turkish_dict.get(letter, letter)
def turkish_word_to_latin(word):
    """Convert a single word by running it through the replacement tables.

    Steps, in order:
      1. check_special_comb swaps "c"/"C" for Cyrillic stand-ins.
      2. A word-final "ı" becomes "aў"; otherwise a word-final "er"
         becomes "ar".
      3. Every character is mapped through turkish_letter_to_eng.
      4. cyrillic_to_eng turns the Cyrillic stand-ins into Latin letters.

    Args:
        word: The word to convert; must be a str.

    Returns:
        The converted word.

    Raises:
        TypeError: If word is not a str.
    """
    # Explicit check rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would let bad input slip through.
    if not isinstance(word, str):
        raise TypeError(
            "word must be a str, got {}".format(type(word).__name__)
        )

    word = check_special_comb(word)

    # The two suffix rules are mutually exclusive: once "ı" has been
    # rewritten the word ends in "aў" and can no longer end in "er".
    if word.endswith("ı"):
        word = word[:-1] + "aў"
    elif word.endswith("er"):
        word = word[:-2] + "ar"

    # No second check_special_comb pass is needed: neither suffix rewrite
    # introduces a "c" or "C".
    word = "".join(turkish_letter_to_eng(letter) for letter in word)
    return cyrillic_to_eng(word)
def turkish_sentence_to_latin(sentence):
    """Convert every token of a sentence and join the results with spaces.

    The sentence is split with word_tokenize, each token is passed to
    turkish_word_to_latin, and the converted tokens are joined with a
    single space. Conversion is best-effort: a token whose conversion
    raises is kept unchanged.

    NOTE(review): word_tokenize is not defined or imported in the visible
    part of this file - presumably NLTK's tokenizer; confirm it is in scope.

    Args:
        sentence: The text to convert.

    Returns:
        The space-joined converted tokens.
    """
    processed_word_list = []
    for word in word_tokenize(sentence):
        try:
            converted = turkish_word_to_latin(word)
        except Exception:
            # Keep the original token on failure. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt and SystemExit
            # propagate.
            converted = word
        processed_word_list.append(converted)
    return " ".join(processed_word_list)
|