samiulhaq
/

t-uren-iwslt

Model card Files Files and versions Community

t-uren-iwslt / indic_tokenize.py.py

SAMI

commit from sami

14b1d78 almost 2 years ago

3.47 kB

	#
	# Copyright (c) 2013-present, Anoop Kunchukuttan
	# All rights reserved.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.
	#

	#Program for tokenizing Indian language input
	#
	# @author Anoop Kunchukuttan
	#
	"""
	Tokenizer for Indian languages. Currently, simple punctuation-based tokenizers
	are supported (see `trivial_tokenize`). Major Indian language punctuations are
	handled.
	"""
	import string, re, sys

	from indicnlp.common import IndicNlpException

	### tokenizer patterns
	# string.punctuation is escaped before it is spliced into the character class.
	# Left raw, its `\]` pair is parsed as an escaped `]`, so the backslash itself
	# never becomes a member of the class and is not split off as punctuation.
	_escaped_ascii_punct=re.escape(string.punctuation)

	# ASCII punctuation plus U+0964 and U+0965 (the purna virama and deergha virama).
	triv_tokenizer_indic_pat=re.compile(r'(['+_escaped_ascii_punct+r'\u0964\u0965'+r'])')
	# ASCII punctuation plus the Arabic-script punctuation code points used for Urdu.
	triv_tokenizer_urdu_pat=re.compile(r'(['+_escaped_ascii_punct+r'\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4'+r'])')

	## date, numbers, section/article numbering
	# Two or more ASCII digit groups joined by a space-padded , . : or / separator,
	# i.e. the shape a number or date has after punctuation was padded with spaces.
	pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')

	def trivial_tokenize_indic(text):
	    """Tokenize a string written in a Brahmi-derived Indian language script.

	    A trivial, language-independent tokenizer that splits on punctuation
	    boundaries only. The punctuation set is the one matched by
	    `triv_tokenizer_indic_pat`, which includes the Indian-script purna
	    virama and deergha virama. Digit sequences that the punctuation step
	    broke apart (numbers, dates, section/article numbering, as matched by
	    `pat_num_seq`) are glued back together so they stay single tokens.

	    Only the space character separates tokens; tabs are converted to spaces
	    first, while other whitespace such as a trailing newline is kept inside
	    the token it is attached to. An empty input yields `['']`.

	    Args:
	        text (str): text to tokenize

	    Returns:
	        list: list of tokens
	    """
	    # Surround every punctuation mark with spaces, then squeeze runs of
	    # spaces to one and drop the leading/trailing ones.
	    padded = triv_tokenizer_indic_pat.sub(r' \1 ', text.replace('\t', ' '))
	    normalized = re.sub(r'[ ]+', ' ', padded).strip(' ')

	    # do not tokenize numbers and dates: remove the spaces that were just
	    # inserted inside each digit sequence. A callable replacement does this
	    # in one pass instead of rebuilding the string piece by piece.
	    rejoined = pat_num_seq.sub(lambda m: m.group(0).replace(' ', ''), normalized)

	    return rejoined.split(' ')

	def trivial_tokenize_urdu(text):
	    """Tokenize an Urdu string.

	    A trivial tokenizer that splits on punctuation boundaries only. The
	    punctuation set is the one matched by `triv_tokenizer_urdu_pat`, which
	    covers Urdu-script punctuation; those characters were identified from
	    the Unicode database for Arabic script by looking for punctuation
	    symbols.

	    Args:
	        text (str): text to tokenize

	    Returns:
	        list: list of tokens
	    """
	    # Tabs are treated as plain spaces.
	    without_tabs = text.replace('\t', ' ')
	    # Pad each punctuation mark with a space on both sides.
	    padded = triv_tokenizer_urdu_pat.sub(r' \1 ', without_tabs)
	    # Collapse runs of spaces, trim the ends, and split on single spaces.
	    single_spaced = re.sub(r'[ ]+', ' ', padded)
	    return single_spaced.strip(' ').split(' ')

	def trivial_tokenize(text,lang='hi'):
	    """Trivial tokenizer for Indian languages using Brahmi or Arabic scripts.

	    A trivial tokenizer that splits on punctuation boundaries only.
	    Major punctuation marks specific to Indian languages are handled.
	    These punctuation characters were identified from the Unicode database.

	    Args:
	        text (str): text to tokenize
	        lang (str): language code; 'ur' selects the Urdu tokenizer and any
	            other value selects the Indic tokenizer

	    Returns:
	        list: list of tokens
	    """
	    # Urdu uses its own punctuation set; everything else takes the Indic path.
	    tokenizer = trivial_tokenize_urdu if lang=='ur' else trivial_tokenize_indic
	    return tokenizer(text)

	# if __name__ == '__main__':

	# if len(sys.argv)<4:
	# print("Usage: python indic_tokenize.py <infile> <outfile> <language>")
	# sys.exit(1)

	# with open(sys.argv[1],'r', encoding='utf-8') as ifile:
	# with open(sys.argv[2],'w', encoding='utf-8') as ofile:
	# for line in ifile:
	# tokenized_line=' '.join(trivial_tokenize(line,sys.argv[3]))
	# ofile.write(tokenized_line)