Spaces:

cdactvm
/

Hindi_ASR

Sleeping

App Files Files Community

Hindi_ASR / text2int.py

cdactvm

Upload 5 files

45f9ca7 verified 3 months ago

raw

history blame

10.5 kB

	#!/usr/bin/env python
	# coding: utf-8

	# In[ ]:


	# # Function to convert spelled-out English number words in text to digits (earlier draft, kept commented out; the active version is text_to_int below)
	# from isNumber import is_number

	# def text_to_int (textnum, numwords={}):
	# units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
	# 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
	# 'sixteen', 'seventeen', 'eighteen', 'nineteen']
	# tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
	# scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion']
	# ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
	# ordinal_endings = [('ieth', 'y'), ('th', '')]

	# if not numwords:
	# numwords['and'] = (1, 0)
	# for idx, word in enumerate(units): numwords[word] = (1, idx)
	# for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
	# for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

	# textnum = textnum.replace('-', ' ')

	# current = result = 0
	# curstring = ''
	# onnumber = False
	# lastunit = False
	# lastscale = False

	# def is_numword(x):
	# if is_number(x):
	# return True
	# if word in numwords:
	# return True
	# return False

	# def from_numword(x):
	# if is_number(x):
	# scale = 0
	# increment = int(x.replace(',', ''))
	# return scale, increment
	# return numwords[x]

	# for word in textnum.split():
	# if word in ordinal_words:
	# scale, increment = (1, ordinal_words[word])
	# current = current * scale + increment
	# if scale > 100:
	# result += current
	# current = 0
	# onnumber = True
	# lastunit = False
	# lastscale = False
	# else:
	# for ending, replacement in ordinal_endings:
	# if word.endswith(ending):
	# word = "%s%s" % (word[:-len(ending)], replacement)

	# if (not is_numword(word)) or (word == 'and' and not lastscale):
	# if onnumber:
	# # Flush the current number we are building
	# curstring += repr(result + current) + " "
	# curstring += word + " "
	# result = current = 0
	# onnumber = False
	# lastunit = False
	# lastscale = False
	# else:
	# scale, increment = from_numword(word)
	# onnumber = True

	# if lastunit and (word not in scales):
	# # Assume this is part of a string of individual numbers to
	# # be flushed, such as a zipcode "one two three four five"
	# curstring += repr(result + current)
	# result = current = 0

	# if scale > 1:
	# current = max(1, current)

	# current = current * scale + increment
	# if scale > 100:
	# result += current
	# current = 0

	# lastscale = False
	# lastunit = False
	# if word in scales:
	# lastscale = True
	# elif word in units:
	# lastunit = True

	# if onnumber:
	# curstring += repr(result + current)

	# return curstring


	# In[ ]:


	from isNumber import is_number # Required: text_to_int below calls is_number()

	def text_to_int(textnum, numwords=None):
	    """Replace spelled-out English number words in *textnum* with digits.

	    The text is split on whitespace (hyphens count as spaces). Consecutive
	    number words are folded into a single integer; every other token is
	    copied through unchanged. For example "two hundred thousand" becomes
	    "200000", "five lac twenty thousand" becomes "520000", and a run of
	    single units such as "one two three" is emitted as "1 2 3".

	    Args:
	        textnum: Input text.
	        numwords: Optional mapping of number word -> (scale, increment).
	            When omitted a fresh table is built for this call. An empty
	            dict passed by the caller is filled in place with the default
	            table; a non-empty dict is used as given.

	    Returns:
	        The converted text, tokens separated by single spaces, with no
	        leading or trailing whitespace.

	    Notes:
	        Tokens missing from the table are passed to ``is_number`` (imported
	        at module level from ``isNumber``; its exact rules are not visible
	        here). Tokens it accepts are parsed with ``int`` after removing
	        commas, so ``int`` may raise ``ValueError`` for accepted tokens it
	        cannot parse.
	    """
	    units = ('zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
	             'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
	             'sixteen', 'seventeen', 'eighteen', 'nineteen')
	    tens = ('twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety')
	    # Explicit power of ten per scale word. Deriving the exponent from the
	    # list position broke once "lac" was inserted: every later scale was
	    # shifted by three orders of magnitude (million became 10**9).
	    scales = {'hundred': 2, 'thousand': 3, 'lac': 5,
	              'million': 6, 'billion': 9, 'trillion': 12}
	    # Ordinals whose cardinal form cannot be obtained by stripping a suffix.
	    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
	                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
	    ordinal_endings = (('ieth', 'y'), ('th', ''))

	    if numwords is None:
	        # A None default avoids sharing one mutable table between calls.
	        numwords = {}
	    if not numwords:
	        numwords['and'] = (1, 0)  # lets "one hundred and twenty" continue
	        for value, word in enumerate(units):
	            numwords[word] = (1, value)
	        for idx, word in enumerate(tens, start=2):
	            numwords[word] = (1, idx * 10)
	        for word, exponent in scales.items():
	            numwords[word] = (10 ** exponent, 0)

	    def lookup(token):
	        """Return (scale, increment) for *token*, or None if it is not numeric."""
	        if token in numwords:
	            return numwords[token]
	        if is_number(token):
	            # Scale 0 makes a literal number replace the partial value.
	            return 0, int(token.replace(',', ''))
	        return None

	    pieces = []
	    current = result = 0
	    onnumber = False
	    lastunit = False
	    lastscale = False

	    for word in textnum.replace('-', ' ').split():
	        if word in ordinal_words:
	            # Irregular ordinals add their value to the number in progress
	            # ("twenty first" -> 21).
	            current += ordinal_words[word]
	            onnumber = True
	            lastunit = lastscale = False
	            continue

	        # Accept a regular ordinal ("fourth", "twentieth") only when its
	        # stem is a number; otherwise keep the token intact so ordinary
	        # words such as "month" or "with" are not truncated.
	        entry = None
	        for ending, replacement in ordinal_endings:
	            if word.endswith(ending):
	                stem = word[:-len(ending)] + replacement
	                if stem:
	                    entry = lookup(stem)
	                if entry is not None:
	                    word = stem
	                    break
	        if entry is None:
	            entry = lookup(word)

	        if entry is None or (word == 'and' and not lastscale):
	            # Not part of a number: flush any pending number, then copy
	            # the token through as-is.
	            if onnumber:
	                pieces.append(str(result + current))
	            pieces.append(word)
	            result = current = 0
	            onnumber = lastunit = lastscale = False
	            continue

	        scale, increment = entry
	        onnumber = True

	        if lastunit and word not in scales:
	            # A unit directly after a unit is read as a digit sequence
	            # ("one two three"), so emit the previous number on its own.
	            pieces.append(str(result + current))
	            result = current = 0

	        if scale > 1:
	            # A bare scale word ("thousand") counts as one of that scale.
	            current = max(1, current)

	        current = current * scale + increment

	        # Only scales above "hundred" close a group. "hundred" must stay in
	        # `current` so a following scale multiplies it
	        # ("two hundred thousand" -> 200000).
	        if scale > 100:
	            result += current
	            current = 0

	        lastscale = word in scales
	        lastunit = word in units

	    if onnumber:
	        pieces.append(str(result + current))

	    return " ".join(pieces)


	# In[ ]: