Spaces:

DebasishDhal99
/

The-Language-Transliteration-Project

Running

App Files Files Community

The-Language-Transliteration-Project / turkish.py

DebasishDhal99

Debugging Turkish end char pronunciation

718e9ec over 1 year ago

raw

history blame

3.01 kB

	# import nltk
	# nltk.download('punkt')
	# from nltk.tokenize import word_tokenize
	import string

	# Turkish "c"/"C" is ultimately rendered as Latin "j"/"J". It is first swapped
	# for the Cyrillic letters "ј"/"Ј" (used in Serbian) and only mapped to Latin
	# by cyrillic_equiv_dict at the very end: a direct "c" -> "j" replacement would
	# be rewritten to "zh" by the "j" entry of turkish_dict.
	special_combs = {"c": "ј", "C": "Ј"}

	# Per-letter replacements applied to every character of a word.
	turkish_dict = {
	    "ç": "ch", "Ç": "Ch",
	    "ğ": "'", "Ğ": "'",
	    "ı": "e", "I": "E",
	    "i": "i", "İ": "I",
	    # Capital "J" keeps its case, like the other capital digraphs (Ch, Sh, Yu).
	    "j": "zh", "J": "Zh",
	    "ö": "o", "Ö": "O",
	    "ş": "sh", "Ş": "Sh",
	    "ü": "yu", "Ü": "Yu",
	    "w": "v", "W": "V",
	}

	# Cyrillic placeholders mapped back to Latin after the per-letter pass.
	# "ў" stands in for "w", which would otherwise be rewritten to "v" by turkish_dict.
	cyrillic_equiv_dict = {
	    "ј": "j", "Ј": "J",
	    "ў": "w",
	}

	def check_special_comb(word):
	    """Return *word* with every key of ``special_combs`` replaced by its placeholder."""
	    for source_char, placeholder in special_combs.items():
	        # str.replace leaves the string untouched when source_char is absent.
	        word = word.replace(source_char, placeholder)
	    return word

	def cyrillic_to_eng(word):
	    """Return *word* with every key of ``cyrillic_equiv_dict`` replaced by its Latin value."""
	    for placeholder, latin in cyrillic_equiv_dict.items():
	        # str.replace leaves the string untouched when placeholder is absent.
	        word = word.replace(placeholder, latin)
	    return word

	def turkish_letter_to_eng(letter):
	    """Return the ``turkish_dict`` replacement for *letter*, or *letter* itself if it has none."""
	    return turkish_dict.get(letter, letter)

	def check_punc(word):
	    """Detect trailing punctuation on *word*.

	    The pronunciation of some Turkish characters changes when they are at the
	    end of a word, and trailing punctuation such as "." would hide that ending,
	    so callers use this to split the punctuation off first.

	    Args:
	        word: The string to inspect.

	    Returns:
	        A ``(ending_punc, how_many_punc_end)`` tuple: whether *word* ends with a
	        character from ``string.punctuation``, and how many consecutive such
	        characters sit at its end ("deniz." -> (True, 1), "deniz!!!" -> (True, 3)).
	    """
	    # rstrip treats its argument as a set of characters, so this removes the
	    # whole trailing run of punctuation, including when the word consists of
	    # nothing but punctuation.
	    without_trailing_punc = word.rstrip(string.punctuation)
	    how_many_punc_end = len(word) - len(without_trailing_punc)
	    return how_many_punc_end > 0, how_many_punc_end


	def turkish_word_to_latin(word):
	    """Transliterate a single Turkish word into Latin/English spelling.

	    Trailing punctuation is detached first so the word-final rules see the real
	    ending, and is re-attached unchanged at the end.

	    Args:
	        word: The word to convert; must be exactly of type ``str``.

	    Returns:
	        The transliterated word, including any trailing punctuation.

	    Raises:
	        AssertionError: If *word* is not a ``str``.
	    """
	    assert type(word) is str

	    # "deniz." -> (True, 1); "deniz!!!" -> (True, 3)
	    has_trailing_punc, punc_count = check_punc(word)

	    trailing_punc = ""
	    if has_trailing_punc:
	        trailing_punc = word[-punc_count:]
	        word = word[:-punc_count]

	    word = check_special_comb(word)

	    # Word-final rules: a final "ı" becomes "aў" ("ў" is a placeholder that is
	    # turned into "w" by cyrillic_to_eng below), and a final "er" becomes "ar".
	    if word.endswith("ı"):
	        word = word[:-1] + "aў"
	    if word.endswith("er"):
	        word = word[:-2] + "ar"

	    # Same pipeline as before: placeholders again, then the per-letter mapping,
	    # then placeholders back to Latin.
	    word = check_special_comb(word)
	    word = "".join(map(turkish_letter_to_eng, word))
	    word = cyrillic_to_eng(word)

	    # Put back any punctuation that was detached from the original word.
	    if trailing_punc:
	        word += trailing_punc
	    return word


	def turkish_sentence_to_latin(sentence):
	    """Transliterate every space-separated word of *sentence*.

	    Args:
	        sentence: The Turkish text to convert.

	    Returns:
	        The converted words joined back with single spaces. A word whose
	        conversion raises an error is kept unchanged.
	    """
	    # The NLTK tokenizer was not used because it sometimes also splits on "'":
	    # İstanbul'u becomes İstanbul ' u. Splitting on a single space keeps such
	    # words intact.
	    processed_word_list = []

	    for word in sentence.split(" "):
	        try:
	            processed_word_list.append(turkish_word_to_latin(word))
	        except Exception:
	            # Best effort: keep the untouched word rather than failing the whole
	            # sentence. Catching Exception (not a bare except) lets
	            # KeyboardInterrupt and SystemExit propagate.
	            processed_word_list.append(word)

	    return " ".join(processed_word_list)