DebasishDhal99 committed on
Commit
6cfbfef
1 Parent(s): 2e3d8ee

Create turkish.py

Browse files
Files changed (1) hide show
  1. turkish.py +67 -0
turkish.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Turkish "c"/"C" are first swapped for the Serbian Cyrillic je characters;
# those placeholders are converted to Latin "j"/"J" at a later stage.
special_combs = {
    "c": "ј",
    "C": "Ј",
}
2
+
3
# Maps individual Turkish letters to the Latin/English string that replaces
# them. Upper-case letters that expand to a digraph are capitalised on the
# first character only (e.g. "Ch", "Sh", "Yu", "Zh").
turkish_dict = {
    "ç": "ch", "Ç": "Ch",
    "ğ": "'", "Ğ": "'",
    "ı": "e", "I": "E",
    "i": "i", "İ": "I",
    "j": "zh", "J": "Zh",  # was "zh": upper-case J lost its capitalisation
    "ö": "o", "Ö": "O",
    "ş": "sh", "Ş": "Sh",
    "ü": "yu", "Ü": "Yu",
    "w": "v", "W": "V",
}
15
+
16
# Cyrillic placeholder characters and the Latin letters that replace them
# in the final output.
cyrillic_equiv_dict = {
    "ј": "j",
    "Ј": "J",
    "ў": "w",
}
20
+
21
def check_special_comb(word):
    """Return *word* with each key of ``special_combs`` replaced by its value.

    Keys are handled in the dictionary's iteration order, and every
    replacement is applied to the result of the previous one.
    """
    for source, placeholder in special_combs.items():
        if source not in word:
            continue
        word = word.replace(source, placeholder)
    return word
26
+
27
def cyrillic_to_eng(word):
    """Return *word* with each key of ``cyrillic_equiv_dict`` replaced by its value.

    Replacements are applied one key at a time, in the dictionary's
    iteration order.
    """
    result = word
    for placeholder, latin in cyrillic_equiv_dict.items():
        if placeholder in result:
            result = result.replace(placeholder, latin)
    return result
32
+
33
def turkish_letter_to_eng(letter):
    """Look *letter* up in ``turkish_dict``.

    Returns the mapped replacement string when the letter has an entry,
    otherwise returns *letter* itself unchanged.
    """
    return turkish_dict.get(letter, letter)
38
+
39
+
40
def turkish_word_to_latin(word):
    """Transliterate a single Turkish word into Latin/English spelling.

    Steps, in order:
      1. Replace the ``special_combs`` keys with their placeholder characters.
      2. Rewrite a trailing "ı" as "aў" and a trailing "er" as "ar".
      3. Apply ``special_combs`` again to the rewritten word.
      4. Map every character through ``turkish_letter_to_eng``.
      5. Convert the remaining placeholders with ``cyrillic_to_eng``.

    Args:
        word: The word to convert; must be a ``str``.

    Returns:
        The transliterated word.

    Raises:
        TypeError: If *word* is not a string.
    """
    # Explicit check instead of ``assert``: assertions are stripped when
    # Python runs with ``-O``, which would let non-string input through.
    if not isinstance(word, str):
        raise TypeError(f"word must be a str, got {type(word).__name__}")

    word = check_special_comb(word)

    # Word-final rewrites. "ў" is a placeholder that becomes "w" at the end,
    # so it is not affected by the "w" -> "v" letter mapping.
    if word.endswith("ı"):
        word = word[:-1] + "aў"
    if word.endswith("er"):
        word = word[:-2] + "ar"

    # Second pass over special_combs, after the suffix rewrites.
    word = check_special_comb(word)

    word = "".join(turkish_letter_to_eng(letter) for letter in word)
    return cyrillic_to_eng(word)
54
+
55
+
56
def turkish_sentence_to_latin(sentence):
    """Transliterate every token of a Turkish sentence and re-join with spaces.

    The sentence is split with ``word_tokenize``; each token is passed to
    ``turkish_word_to_latin``. A token whose conversion raises is kept in its
    original form (best-effort behaviour).

    Args:
        sentence: The text to transliterate.

    Returns:
        The processed tokens joined by single spaces.
    """
    # NOTE(review): ``word_tokenize`` is not imported in the visible part of
    # this file -- presumably ``nltk.tokenize.word_tokenize``; confirm that the
    # import exists, otherwise this call raises NameError.
    processed_word_list = []

    for word in word_tokenize(sentence):
        try:
            converted = turkish_word_to_latin(word)
        except Exception:
            # Fall back to the untouched token. ``Exception`` (not a bare
            # ``except``) so KeyboardInterrupt/SystemExit still propagate.
            converted = word
        processed_word_list.append(converted)

    return " ".join(processed_word_list)