DebasishDhal99 committed
Commit d043c48
Parent(s): 0a1267d
Updating the tokenizer method from nltk to regular splitting
turkish.py CHANGED (+2, -1)
@@ -58,7 +58,8 @@ def turkish_word_to_latin(word):
 
 
 def turkish_sentence_to_latin(sentence):
-    word_list = word_tokenize(sentence)
+    # word_list = word_tokenize(sentence)  # The NLTK tokenizer didn't work out: it sometimes also splits on "'", so İstanbul'u becomes İstanbul ' u.
+    word_list = sentence.split(" ")
     processed_word_list = []
 
     for word in word_list:
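For context, a minimal sketch (not part of the commit) of the behaviour the new comment describes. The İstanbul'u example comes from the commit itself; the rest of the sample sentence is made up, and the exact NLTK split depends on the installed punkt model:

# Minimal illustration of why word_tokenize was dropped: NLTK's
# English-oriented tokenizer treats the apostrophe as a token boundary,
# so Turkish suffixes such as the 'u in İstanbul'u get split off.
import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt", quiet=True)  # tokenizer data; newer NLTK versions may need "punkt_tab" instead

sentence = "İstanbul'u çok seviyorum"

print(word_tokenize(sentence))  # apostrophe split off, e.g. ['İstanbul', "'u", 'çok', 'seviyorum']
print(sentence.split(" "))      # suffix stays attached: ["İstanbul'u", 'çok', 'seviyorum']

One trade-off of the plain split worth noting: sentence.split(" ") yields empty strings for consecutive spaces and leaves punctuation glued to words, whereas sentence.split() with no argument would at least collapse whitespace runs.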