DebasishDhal99
committed on
Commit
•
6cfbfef
1
Parent(s):
2e3d8ee
Create turkish.py
Browse files- turkish.py +67 -0
turkish.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Placeholder substitutions applied before the per-letter mapping: Latin
# "c"/"C" are swapped for the Serbian Cyrillic letter Je, which is converted
# to Latin "j"/"J" at the very end of the word conversion.
special_combs = {
    "c": "ј",
    "C": "Ј",
}
|
2 |
+
|
3 |
+
# Per-letter transliteration table: each key is a single character and each
# value is the Latin text emitted for it. Characters that are not listed are
# passed through unchanged by turkish_letter_to_eng.
turkish_dict = {
    "ç": "ch", "Ç": "Ch",
    "ğ": "'", "Ğ": "'",
    "ı": "e", "I": "E",
    "i": "i", "İ": "I",
    # Upper-case "J" yields capitalised "Zh", consistent with "Ch"/"Sh"/"Yu".
    "j": "zh", "J": "Zh",
    "ö": "o", "Ö": "O",
    "ş": "sh", "Ş": "Sh",
    "ü": "yu", "Ü": "Yu",
    "w": "v", "W": "V",
}
|
15 |
+
|
16 |
+
# Final clean-up table: Cyrillic placeholder letters left in a word after the
# per-letter pass, mapped to the Latin letters that should appear in the
# output.
cyrillic_equiv_dict = {
    "ј": "j",
    "Ј": "J",
    "ў": "w",
}
|
20 |
+
|
21 |
+
def check_special_comb(word):
    """Return *word* with each key of ``special_combs`` replaced by its value.

    Every occurrence is replaced, and the keys are processed in the
    dictionary's own order.
    """
    for source, placeholder in special_combs.items():
        if source not in word:
            continue
        word = word.replace(source, placeholder)
    return word
|
26 |
+
|
27 |
+
def cyrillic_to_eng(word):
    """Return *word* with each key of ``cyrillic_equiv_dict`` replaced by its value.

    Every occurrence is replaced, and the keys are processed in the
    dictionary's own order.
    """
    for placeholder, latin in cyrillic_equiv_dict.items():
        if placeholder not in word:
            continue
        word = word.replace(placeholder, latin)
    return word
|
32 |
+
|
33 |
+
def turkish_letter_to_eng(letter):
    """Return the ``turkish_dict`` entry for *letter*, or *letter* itself if absent."""
    return turkish_dict.get(letter, letter)
|
38 |
+
|
39 |
+
|
40 |
+
def turkish_word_to_latin(word):
    """Convert a single word to its Latin transliteration.

    Steps, in order:

    1. Replace the ``special_combs`` letters ("c"/"C") with their Cyrillic
       placeholders via ``check_special_comb``.
    2. Rewrite the word ending: a trailing "ı" becomes "aў", otherwise a
       trailing "er" becomes "ar".
    3. Map every character through ``turkish_letter_to_eng``.
    4. Turn the remaining Cyrillic placeholders into Latin letters via
       ``cyrillic_to_eng``.

    Args:
        word: The word to convert; must be a ``str``.

    Returns:
        The converted word as a ``str``.

    Raises:
        AssertionError: If *word* is not a ``str`` (only when assertions are
            enabled).
    """
    # Kept as an assertion so the exception type seen by callers is unchanged;
    # isinstance also accepts str subclasses.
    assert isinstance(word, str), "word must be a str"

    # One pass is sufficient: the suffix rewrites below never introduce a
    # "c" or "C", so repeating the substitution afterwards would be a no-op.
    word = check_special_comb(word)

    # The two suffix rules are mutually exclusive: once "ı" has been rewritten
    # the word ends in "aў" and can no longer end in "er".
    if word.endswith("ı"):
        word = word[:-1] + "aў"
    elif word.endswith("er"):
        word = word[:-2] + "ar"

    word = "".join(turkish_letter_to_eng(letter) for letter in word)
    return cyrillic_to_eng(word)
|
54 |
+
|
55 |
+
|
56 |
+
def turkish_sentence_to_latin(sentence):
    """Convert every token of *sentence* and join the results with single spaces.

    The sentence is split with ``word_tokenize`` and each token is passed to
    ``turkish_word_to_latin``. Conversion is best-effort: a token whose
    conversion raises is kept unchanged in the output.

    Args:
        sentence: The text to convert.

    Returns:
        The converted tokens joined by a single space.
    """
    # NOTE(review): word_tokenize is not defined or imported in the visible
    # file; presumably nltk.tokenize.word_tokenize - confirm it is in scope,
    # otherwise this call raises NameError.
    tokens = word_tokenize(sentence)
    converted = []

    for token in tokens:
        try:
            converted.append(turkish_word_to_latin(token))
        except Exception:
            # Best-effort fallback: keep the original token. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            converted.append(token)

    return " ".join(converted)
|