|
|
|
|
|
|
|
import string |
|
|
|
# Substitutions applied by check_special_comb() before any other rewriting.
# Latin "c"/"C" are swapped for placeholder characters that
# cyrillic_equiv_dict later maps to "j"/"J"; using a placeholder keeps the
# per-letter pass (turkish_dict) from rewriting that "j" as "zh".
special_combs = {
    "c": "ј",
    "C": "Ј",
}
|
|
|
# Per-character replacements looked up by turkish_letter_to_eng(); any
# character that is not a key here is passed through unchanged.
# Note that the plain Latin capital "I" is treated as the upper-case form of
# dotless "ı" and therefore maps to "E".
turkish_dict = {
    "ç": "ch", "Ç": "Ch",
    "ğ": "'", "Ğ": "'",
    "ı": "e", "I": "E",
    "i": "i", "İ": "I",
    # Upper-case "J" yields a capitalised digraph, matching "Ç" -> "Ch",
    # "Ş" -> "Sh" and "Ü" -> "Yu" (it previously produced lower-case "zh").
    "j": "zh", "J": "Zh",
    "ö": "o", "Ö": "O",
    "ş": "sh", "Ş": "Sh",
    "ü": "yu", "Ü": "Yu",
    "w": "v", "W": "V",
}
|
|
|
# Final clean-up table used by cyrillic_to_eng(): each placeholder character
# (left) is replaced by its plain Latin letter (right).
cyrillic_equiv_dict = {
    "ј": "j",
    "Ј": "J",
    "ў": "w",
}
|
|
|
def check_special_comb(word):
    """Apply every substitution listed in ``special_combs`` to *word*.

    Args:
        word: The text to rewrite.

    Returns:
        *word* with each ``special_combs`` key replaced by its value; the
        input is returned unchanged when none of the keys occur in it.
    """
    for source, target in special_combs.items():
        if source not in word:
            continue
        word = word.replace(source, target)
    return word
|
|
|
def cyrillic_to_eng(word):
    """Replace the placeholder characters in *word* with Latin letters.

    Args:
        word: The text to rewrite.

    Returns:
        *word* with every key of ``cyrillic_equiv_dict`` replaced by the
        corresponding value.
    """
    for placeholder, latin in cyrillic_equiv_dict.items():
        word = word.replace(placeholder, latin) if placeholder in word else word
    return word
|
|
|
def turkish_letter_to_eng(letter):
    """Return the ``turkish_dict`` replacement for a single character.

    Args:
        letter: The character to look up.

    Returns:
        The mapped replacement string, or *letter* itself when it has no
        entry in ``turkish_dict``.
    """
    return turkish_dict.get(letter, letter)
|
|
|
def check_punc(word):
    """Report whether *word* ends in punctuation and how long that run is.

    "Punctuation" means the characters in ``string.punctuation``.

    Args:
        word: The token to inspect.

    Returns:
        A ``(ending_punc, how_many_punc_end)`` tuple. ``ending_punc`` is True
        when the last character of *word* is punctuation, and
        ``how_many_punc_end`` is the length of the unbroken run of
        punctuation characters at the end of *word* (0 when there is none).
    """
    # rstrip removes exactly the trailing punctuation run, so the length
    # difference is the count. Unlike the previous index loop, this also
    # counts the first character, so a word made up entirely of punctuation
    # (e.g. "!" or "...") is reported with its full length.
    without_trailing = word.rstrip(string.punctuation)
    how_many_punc_end = len(word) - len(without_trailing)
    return how_many_punc_end > 0, how_many_punc_end
|
|
|
|
|
def turkish_word_to_latin(word):
    """Rewrite one Turkish word using the module's replacement tables.

    Trailing punctuation is split off first, left untouched, and re-attached
    to the result. The remaining text is rewritten in this order:

    1. ``check_special_comb`` swaps "c"/"C" for placeholder characters.
    2. A final "ı" becomes "aў", and a final "er" becomes "ar".
    3. Every character goes through ``turkish_letter_to_eng``.
    4. ``cyrillic_to_eng`` turns the placeholders into Latin letters.

    Args:
        word: The word to convert; must be a ``str``.

    Returns:
        The converted word, with any trailing punctuation preserved.

    Raises:
        AssertionError: If *word* is not a ``str``.
    """
    assert isinstance(word, str)

    trailing_punc = ""
    ending_punc, how_many_punc_end = check_punc(word)

    # Only split when the count is non-zero: with a count of 0 the slices
    # word[-0:] / word[:-0] would move the whole word into the suffix.
    if ending_punc and how_many_punc_end:
        trailing_punc = word[-how_many_punc_end:]
        word = word[:-how_many_punc_end]

    # Must run before the per-letter pass, which would otherwise have no way
    # to tell an original "j" (-> "zh") from a "j" produced out of "c".
    word = check_special_comb(word)

    # "ў" is a placeholder for "w"; a literal "w" here would be rewritten to
    # "v" by the per-letter pass below.
    if word.endswith("ı"):
        word = word[:-1] + "aў"
    if word.endswith("er"):
        word = word[:-2] + "ar"

    # The suffix rewrites above add no "c"/"C", so one check_special_comb
    # pass is sufficient.
    word = "".join(turkish_letter_to_eng(letter) for letter in word)
    word = cyrillic_to_eng(word)

    return word + trailing_punc
|
|
|
|
|
def turkish_sentence_to_latin(sentence):
    """Convert a sentence word by word with ``turkish_word_to_latin``.

    The sentence is split on single space characters and re-joined with
    single spaces, so runs of spaces in the input are preserved.

    Args:
        sentence: The text to convert.

    Returns:
        The converted sentence. A word whose conversion raises an exception
        is kept exactly as it appeared in the input.
    """
    processed_word_list = []

    for word in sentence.split(" "):
        try:
            processed_word_list.append(turkish_word_to_latin(word))
        except Exception:
            # Best effort: keep the original word rather than failing the
            # whole sentence. Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            processed_word_list.append(word)

    return " ".join(processed_word_list)
|
|