Spaces:

cdactvm
/

ENGLISH_ASR

Sleeping

App Files Files Community

cdactvm committed on Oct 8

Commit

3b58a97

•

1 Parent(s): 07ff57b

Upload 6 files

Browse files

Files changed (6) hide show

Text2List.py +57 -0
convert2list.py +87 -0
isNumber.py +16 -0
processDoubles.py +93 -0
replaceWords.py +156 -0
text2int.py +199 -0

Text2List.py ADDED Viewed

	@@ -0,0 +1,57 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[4]:
def text_to_list():
    """Return the vocabulary of English number words.

    The list contains, in this order: 'eleven'-'nineteen', the multiples of
    ten ('twenty'-'ninety'), every space-separated compound from
    'twenty one' to 'ninety nine', 'zero'-'ten', and the scale words
    'hundred', 'thousand' and 'lakh'.

    Returns:
        list[str]: 103 distinct entries. A new list is built on every call,
        so callers may mutate the result freely.
    """
    ones = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    # English numbers (11-19)
    teens = ['eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
             'sixteen', 'seventeen', 'eighteen', 'nineteen']
    # English multiples of ten (20, 30, ..., 90)
    tens = ['twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    # English combinations 21-29, 31-39, ..., 91-99, generated so that no
    # entry can be mistyped or duplicated by hand.
    compounds = [f'{ten} {one}' for ten in tens for one in ones]
    # English numbers (0-10)
    zero_to_ten = ['zero'] + ones + ['ten']
    # 100, 1000 and 100000. Each scale word must be its own element: a
    # missing comma between adjacent string literals silently concatenates
    # them into a single bogus entry.
    scale_words = ['hundred', 'thousand', 'lakh']

    text_list = teens + tens + compounds + zero_to_ten + scale_words
    return text_list
+# In[ ]:

convert2list.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# import nbimporter
+import nbimporter
+from Text2List import text_to_list
def convert_to_list(text, text_list):
    """Insert spaces around known words found in an unsegmented string.

    The text is scanned left to right. At each position the longest entry of
    ``text_list`` that the text starts with is taken as a token; characters
    that begin no known word are accumulated into a single "unmatched" token
    until the next match. All tokens are finally joined with one space.

    Args:
        text: String to segment. A falsy value (``''`` or ``None``) yields ``''``.
        text_list: Iterable of known words. Empty strings are ignored,
            because an empty word matches everywhere without consuming any
            input and would otherwise make the scan loop forever.

    Returns:
        str: The tokens (matched words and unmatched runs) joined by spaces.
    """
    if not text:
        return ''

    # Longest first so that e.g. 'seventeen' wins over 'seven'.
    candidates = sorted((word for word in text_list if word), key=len, reverse=True)

    tokens = []
    pending = []  # unmatched characters seen since the last match
    pos = 0
    end = len(text)
    while pos < end:
        # startswith(word, pos) tests at an offset, avoiding a fresh copy of
        # the remaining text on every step.
        match = next((word for word in candidates if text.startswith(word, pos)), None)
        if match is None:
            pending.append(text[pos])
            pos += 1
            continue
        if pending:
            # Flush the unmatched run before the word that ends it.
            tokens.append(''.join(pending))
            pending = []
        tokens.append(match)
        pos += len(match)

    if pending:
        tokens.append(''.join(pending))
    return ' '.join(tokens)
if __name__ == "__main__":
    # Demo input: number words run together without spaces. The sample must
    # be assigned here; leaving it commented out makes the script fail with
    # NameError on the call below.
    text = "zeroonetwothreefourfivesixseveneightnineten"
    converted = convert_to_list(text, text_to_list())
    print(converted)
+# # import nbimporter
+# import nbimporter
+# from Text2List import text_to_list
+# def convert_to_list(text, text_list):
+#     matched_words = []
+#     unmatched_text = ''  # To accumulate unmatched characters
+#     # Sort text_list by length in descending order to prioritize longest matches first
+#     text_list_sorted = sorted(text_list, key=len, reverse=True)
+#     while text:
+#         matched = False
+#         for word in text_list_sorted:
+#             if word in text:
+#                 # Add any accumulated unmatched text before appending the matched word
+#                 if unmatched_text:
+#                     matched_words.append(unmatched_text)
+#                     unmatched_text = ''  # Reset unmatched text accumulator
+#                 matched_words.append(word)
+#                 text = text[len(word):]  # Remove the matched part from text
+#                 matched = True
+#                 break
+#         if not matched:
+#             # Accumulate unmatched characters
+#             unmatched_text += text[0]
+#             text = text[1:]
+#     # If there's any remaining unmatched text, add it to the result
+#     if unmatched_text:
+#         matched_words.append(unmatched_text)
+#     # Join matched words and unmatched text with a space
+#     result = ' '.join(matched_words)
+#     return result
+# text = "जीरोएकदोतीनचार"
+# if __name__=="__main__":
+#     converted=convert_to_list(text, text_to_list())
+#     print(converted)

isNumber.py ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
+# Function to check if the string is a number
def is_number(x):
    """Return True if ``x`` can be converted to a float.

    Commas are removed from strings first, so '1,234.5' counts as a number.
    Anything ``float()`` accepts is reported as a number, including strings
    such as 'nan', 'inf' and '1e5'.

    Args:
        x: Value to test; typically a string token, but any object is accepted.

    Returns:
        bool: True when ``float(x)`` succeeds, False otherwise.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        float(x)
    # Only the conversion failures: unsupported type, malformed text, or an
    # int too large for a float. A bare except would also swallow
    # KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return False
    return True

processDoubles.py ADDED Viewed

	@@ -0,0 +1,93 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[1]:
+# # Function to process "double" followed by a number
+# def process_doubles(sentence):
+#     tokens = sentence.split()
+#     result = []
+#     i = 0
+#     while i < len(tokens):
+#         if tokens[i] == "double":
+#             if i + 1 < len(tokens):
+#                 result.append(tokens[i + 1])
+#                 result.append(tokens[i + 1])
+#                 i += 2
+#             else:
+#                 result.append(tokens[i])
+#                 i += 1
+#         else:
+#             result.append(tokens[i])
+#             i += 1
+#     return ' '.join(result)
+# In[ ]:
+# import re
+# def process_doubles(sentence):
+#     # Use regex to split 'डबल' followed by numbers/words without space (e.g., "डबलवन" -> "डबल वन")
+#     sentence = re.sub(r'(डबल)(\S+)', r'\1 \2', sentence)
+#     tokens = sentence.split()
+#     result = []
+#     i = 0
+#     while i < len(tokens):
+#         if tokens[i] == "डबल":
+#             if i + 1 < len(tokens):
+#                 result.append(tokens[i + 1])  # Append the next word/number
+#                 result.append(tokens[i + 1])  # Append the next word/number again to duplicate
+#                 i += 2  # Skip over the next word since it's already processed
+#             else:
+#                 result.append(tokens[i])
+#                 i += 1
+#         else:
+#             result.append(tokens[i])
+#             i += 1
+#     return ' '.join(result)
+# In[2]:
+# Function to process "double" and "triple" followed by a number
def process_multiples(sentence, multipliers=None):
    """Expand repetition words such as "double five" into "five five".

    The sentence is split on whitespace. Whenever a token is a known
    multiplier word and another token follows it, the multiplier is dropped
    and the following token is emitted that many times. The following token
    is consumed as-is, so in "double double five" the second "double" is the
    thing being repeated. A multiplier that is the last token is kept
    unchanged.

    Args:
        sentence: Whitespace-separated text.
        multipliers: Optional mapping of multiplier word to repeat count.
            Defaults to ``{"double": 2, "triple": 3}``.

    Returns:
        str: The expanded tokens joined by single spaces.
    """
    if multipliers is None:
        multipliers = {"double": 2, "triple": 3}

    tokens = sentence.split()
    expanded = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        repeat = multipliers.get(token)
        if repeat is not None and i + 1 < len(tokens):
            expanded.extend([tokens[i + 1]] * repeat)
            i += 2  # skip the token that was just repeated
        else:
            expanded.append(token)
            i += 1
    return ' '.join(expanded)
+# In[ ]:

replaceWords.py ADDED Viewed

	@@ -0,0 +1,156 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[7]:
+import re
def replace_words(sentence):
    """Normalise spelling variants of number words to their canonical form.

    The sentence is split on whitespace and every token that exactly equals a
    known variant is replaced by its canonical word; all other tokens are
    kept unchanged. Matching is case-sensitive and whole-token only
    (e.g. 'black' is not affected by the 'lac' variant).

    Args:
        sentence: Whitespace-separated text.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    # canonical word -> list of variants that should be rewritten to it.
    # Only words that actually have variants need an entry; to normalise a
    # new spelling, add it here (e.g. 'forty': ['fourty']).
    replacement_map = {
        # Lakhs
        'lakh': ['lac', 'lach', 'laq', 'lak'],
    }

    # Reverse lookup so each token costs a single dict access. If a variant
    # is listed under several canonical words, the last one wins.
    variant_to_canonical = {
        variant: canonical
        for canonical, variants in replacement_map.items()
        for variant in variants
    }

    words = sentence.split()  # Split the sentence by whitespace
    return ' '.join(variant_to_canonical.get(word, word) for word in words)
+# In[ ]:

text2int.py ADDED Viewed

	@@ -0,0 +1,199 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[1]:
+# # # Function to convert Hindi text to numerical representation
+# from isNumber import is_number
+# def text_to_int (textnum, numwords={}):
+#     units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
+#             'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
+#             'sixteen', 'seventeen', 'eighteen', 'nineteen']
+#     tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
+#     scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion']
+#     ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
+#     ordinal_endings = [('ieth', 'y'), ('th', '')]
+#     if not numwords:
+#         numwords['and'] = (1, 0)
+#         for idx, word in enumerate(units): numwords[word] = (1, idx)
+#         for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
+#         for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
+#     textnum = textnum.replace('-', ' ')
+#     current = result = 0
+#     curstring = ''
+#     onnumber = False
+#     lastunit = False
+#     lastscale = False
+#     def is_numword(x):
+#         if is_number(x):
+#             return True
+#         if word in numwords:
+#             return True
+#         return False
+#     def from_numword(x):
+#         if is_number(x):
+#             scale = 0
+#             increment = int(x.replace(',', ''))
+#             return scale, increment
+#         return numwords[x]
+#     for word in textnum.split():
+#         if word in ordinal_words:
+#             scale, increment = (1, ordinal_words[word])
+#             current = current * scale + increment
+#             if scale > 100:
+#                 result += current
+#                 current = 0
+#             onnumber = True
+#             lastunit = False
+#             lastscale = False
+#         else:
+#             for ending, replacement in ordinal_endings:
+#                 if word.endswith(ending):
+#                     word = "%s%s" % (word[:-len(ending)], replacement)
+#             if (not is_numword(word)) or (word == 'and' and not lastscale):
+#                 if onnumber:
+#                     # Flush the current number we are building
+#                     curstring += repr(result + current) + " "
+#                 curstring += word + " "
+#                 result = current = 0
+#                 onnumber = False
+#                 lastunit = False
+#                 lastscale = False
+#             else:
+#                 scale, increment = from_numword(word)
+#                 onnumber = True
+#                 if lastunit and (word not in scales):
+#                     # Assume this is part of a string of individual numbers to
+#                     # be flushed, such as a zipcode "one two three four five"
+#                     curstring += repr(result + current)
+#                     result = current = 0
+#                 if scale > 1:
+#                     current = max(1, current)
+#                 current = current * scale + increment
+#                 if scale > 100:
+#                     result += current
+#                     current = 0
+#                 lastscale = False
+#                 lastunit = False
+#                 if word in scales:
+#                     lastscale = True
+#                 elif word in units:
+#                     lastunit = True
+#     if onnumber:
+#         curstring += repr(result + current)
+#     return curstring
+# In[5]:
+import nbimporter
+from isNumber import is_number  # Remove or replace this if unnecessary
def text_to_int(textnum, numwords=None):
    """Replace spelled-out English numbers in a sentence with digits.

    Hyphens are turned into spaces and the text is split on whitespace.
    Consecutive number words are combined into one value
    ("one hundred and twenty three" -> "123", "five lakh" -> "500000"),
    while a unit word (zero-nineteen) that directly follows another unit word
    starts a new number, so digit sequences are preserved
    ("one two three" -> "1 2 3"). Ordinals ("fourth", "twentieth",
    "twenty-first") are converted as well. Tokens that are already integer
    literals ("5", "1,000") behave like unit words. Every other token is
    copied through unchanged.

    Args:
        textnum: The sentence to convert.
        numwords: Optional mapping of word -> ``(scale, increment)``. When
            omitted, the built-in English table is used; an empty dict that
            is passed in is filled with that table in place.

    Returns:
        str: The converted sentence with tokens separated by single spaces.
    """
    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
             'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
             'sixteen', 'seventeen', 'eighteen', 'nineteen']
    # The two leading placeholders keep each word at index == value / 10.
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    # Explicit powers of ten. Deriving them from the list position breaks as
    # soon as a word such as 'lakh' (10^5) is inserted between 'thousand'
    # and 'million'.
    scale_exponents = {'hundred': 2, 'thousand': 3, 'lakh': 5,
                       'million': 6, 'billion': 9, 'trillion': 12}
    scales = list(scale_exponents)
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    if numwords is None:
        numwords = {}
    if not numwords:
        numwords['and'] = (1, 0)  # Handle "one hundred and twenty"
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # skip the placeholders so '' never counts as a number
                numwords[word] = (1, idx * 10)
        for word, exponent in scale_exponents.items():
            numwords[word] = (10 ** exponent, 0)

    def parse_int(token):
        """Return ``token`` as an int (commas ignored), or None if it is not one."""
        try:
            return int(token.replace(',', ''))
        except ValueError:
            return None

    # Remove hyphens so "twenty-one" is handled like "twenty one".
    textnum = textnum.replace('-', ' ')

    pieces = []           # output tokens, joined with spaces at the end
    current = result = 0  # number being built: result + current
    onnumber = False      # True while a number is pending
    lastunit = False      # previous token was a unit word or integer literal
    lastscale = False     # previous token was a scale word

    for word in textnum.split():
        if word in ordinal_words:
            current += ordinal_words[word]
            onnumber = True
            lastunit = lastscale = False
            continue

        # Try the word with its ordinal suffix removed ("fourth" -> "four",
        # "twentieth" -> "twenty"), but keep the original spelling unless the
        # stripped form really is a number; otherwise ordinary words such as
        # "with" or "month" would be emitted truncated.
        stripped = word
        for ending, replacement in ordinal_endings:
            if stripped.endswith(ending):
                stripped = stripped[:-len(ending)] + replacement
        if stripped != word and (parse_int(stripped) is not None or stripped in numwords):
            word = stripped

        literal = parse_int(word)
        is_numword = literal is not None or word in numwords

        # "and" only belongs to a number directly after a scale word.
        if not is_numword or (word == 'and' and not lastscale):
            if onnumber:
                pieces.append(str(result + current))
            pieces.append(word)
            result = current = 0
            onnumber = lastunit = lastscale = False
            continue

        # Integer literals act like unit words: scale 1, increment = value.
        scale, increment = (1, literal) if literal is not None else numwords[word]
        onnumber = True
        if lastunit and word not in scales:
            # Part of a run of individual numbers, e.g. "one two three".
            pieces.append(str(result + current))
            result = current = 0
        if scale > 1:
            current = max(1, current)  # bare "hundred" means "one hundred"
        current = current * scale + increment
        # "hundred" must stay in ``current`` so that a following larger scale
        # multiplies it ("two hundred thousand" -> 200000); only scales above
        # 100 are folded into the running total.
        if scale > 100:
            result += current
            current = 0
        lastscale = word in scales
        lastunit = literal is not None or word in units

    if onnumber:
        pieces.append(str(result + current))
    return ' '.join(pieces)
+# In[ ]: