File size: 3,006 Bytes
04ac83f
 
 
718e9ec
a216a89
6cfbfef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ed0334
 
 
 
718e9ec
 
 
4ed0334
 
 
718e9ec
 
4ed0334
 
 
718e9ec
 
 
4ed0334
 
 
6cfbfef
 
4ed0334
 
 
 
 
 
 
 
 
6cfbfef
 
 
 
 
 
 
 
 
 
 
4ed0334
 
 
6cfbfef
 
 
 
d043c48
 
6cfbfef
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize
import string

# Whole-word substitutions applied before the per-letter pass. Turkish "c"/"C"
# is swapped for the Serbian (Cyrillic) letter Je as a temporary placeholder;
# cyrillic_equiv_dict below turns that placeholder into a Latin "j"/"J".
# Presumably the detour exists so the placeholder is not caught by the
# "j" -> "zh" rule in turkish_dict.
special_combs = {
    "c": "ј",  # Cyrillic small letter Je, not Latin "j"
    "C": "Ј",  # Cyrillic capital letter Je, not Latin "J"
}

# Per-letter replacements; letters without an entry are left unchanged by
# turkish_letter_to_eng.
turkish_dict = {
    "ç": "ch",
    "Ç": "Ch",
    "ğ": "'",
    "Ğ": "'",
    "ı": "e",
    "I": "E",
    "i": "i",
    "İ": "I",
    "j": "zh",
    # NOTE(review): the other upper-case keys map to a capitalised value;
    # "zh" here may have been meant as "Zh" - confirm before changing.
    "J": "zh",
    "ö": "o",
    "Ö": "O",
    "ş": "sh",
    "Ş": "Sh",
    "ü": "yu",
    "Ü": "Yu",
    "w": "v",
    "W": "V",
}

# Final clean-up: maps the Cyrillic placeholder letters back to Latin ones.
cyrillic_equiv_dict = {
    "ј": "j",
    "Ј": "J",
    "ў": "w",
}

def check_special_comb(word):
    """Return *word* with every key of ``special_combs`` replaced by its value.

    All occurrences of each key are replaced; a word containing none of the
    keys comes back unchanged.
    """
    for source, placeholder in special_combs.items():
        # str.replace is a no-op when `source` is absent, so no pre-check needed.
        word = word.replace(source, placeholder)
    return word

def cyrillic_to_eng(word):
    """Return *word* with each key of ``cyrillic_equiv_dict`` replaced by its value.

    All occurrences of each key are replaced; a word containing none of the
    keys comes back unchanged.
    """
    for cyrillic_char, latin_char in cyrillic_equiv_dict.items():
        # str.replace leaves the word untouched when the character is absent.
        word = word.replace(cyrillic_char, latin_char)
    return word

def turkish_letter_to_eng(letter):
    """Return the ``turkish_dict`` replacement for *letter*, or *letter* itself.

    Letters that have no entry in ``turkish_dict`` are passed through as-is.
    """
    return turkish_dict.get(letter, letter)

def check_punc(word):
    """Report whether *word* ends in punctuation and how many marks trail it.

    Some Turkish letters are transliterated differently at the end of a word,
    and trailing punctuation such as "." or "!!!" would hide the real last
    letter. Callers use this result to peel the punctuation off first.

    Punctuation means any character in ``string.punctuation``. Only the
    unbroken run at the very end of the word is counted; punctuation earlier
    in the word (e.g. the apostrophe in "İstanbul'u") is ignored.

    Args:
        word: The string to inspect. May be empty.

    Returns:
        A ``(ending_punc, how_many_punc_end)`` tuple: a bool that is True when
        the last character is punctuation, and the number of consecutive
        punctuation characters at the end of the word. For example
        "deniz." gives ``(True, 1)`` and "deniz!!!" gives ``(True, 3)``.
    """
    # rstrip treats its argument as a set of characters, so this removes the
    # whole trailing run of punctuation. Unlike an index loop that stops one
    # short of the start, it also counts the first character when the word
    # consists of punctuation only (e.g. "." or "!!!").
    how_many_punc_end = len(word) - len(word.rstrip(string.punctuation))
    ending_punc = how_many_punc_end > 0
    return ending_punc, how_many_punc_end

    
def turkish_word_to_latin(word):
    """Transliterate a single Turkish word.

    Steps, in order:
      1. Trailing punctuation reported by ``check_punc`` is detached so that
         the word-final rules see the real last letter.
      2. ``check_special_comb`` swaps in the placeholder letters.
      3. A final "ı" becomes "aў" and a final "er" becomes "ar".
      4. Every remaining letter goes through ``turkish_letter_to_eng`` and the
         result through ``cyrillic_to_eng``.
      5. The detached punctuation is appended again, unchanged.

    Args:
        word: The word to convert; must be exactly of type ``str``.

    Returns:
        The transliterated word, including any original trailing punctuation.

    Raises:
        AssertionError: If *word* is not a ``str``.
    """
    assert type(word) == str

    # e.g. "deniz." -> (True, 1); "deniz!!!" -> (True, 3)
    has_trailing_punc, punc_count = check_punc(word)

    if has_trailing_punc:
        stem, trailing_punc = word[:-punc_count], word[-punc_count:]
    else:
        stem, trailing_punc = word, ""

    stem = check_special_comb(stem)

    # Word-final rewrites; these must run on the punctuation-free stem.
    if stem.endswith("ı"):
        stem = stem[:-1] + "aў"
    if stem.endswith("er"):
        stem = stem[:-2] + "ar"

    stem = check_special_comb(stem)

    per_letter = "".join(map(turkish_letter_to_eng, stem))
    latin = cyrillic_to_eng(per_letter)

    # Appending an empty string is harmless when there was no punctuation.
    return latin + trailing_punc


def turkish_sentence_to_latin(sentence):
    """Transliterate every space-separated word of *sentence*.

    The sentence is split on single spaces, so runs of spaces survive the
    round trip and other whitespace stays inside its word. Each piece is passed
    to ``turkish_word_to_latin``; the results are joined with single spaces.

    Conversion is best effort: if a word cannot be converted, the original
    word is kept in the output instead of aborting the whole sentence.

    Args:
        sentence: The text to convert.

    Returns:
        The converted sentence as a single string.
    """
    # NLTK's word_tokenize was not used because it sometimes also splits on
    # "'" (e.g. "İstanbul'u" becomes "İstanbul", "'", "u").
    processed_word_list = []

    for word in sentence.split(" "):
        try:
            converted = turkish_word_to_latin(word)
        except Exception:
            # Fall back to the untouched word. Catching Exception rather than
            # using a bare except lets KeyboardInterrupt and SystemExit
            # propagate instead of being silently swallowed.
            converted = word
        processed_word_list.append(converted)

    return " ".join(processed_word_list)