Spaces:

cdactvm
/

Hindi_ASR

Sleeping

App Files Files Community

cdactvm committed on Sep 27, 2024

Commit

45f9ca7

verified ·

1 Parent(s): dec7228

Upload 5 files

Browse files

Files changed (5) hide show

convert2list.py +96 -0
isNumber.py +16 -0
processDoubles.py +25 -0
replaceWords.py +159 -0
text2int.py +199 -0

convert2list.py ADDED Viewed

	@@ -0,0 +1,96 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[30]:
+# import nbimporter
+import nbimporter
+from Text2List import text_to_list
def convert_to_list(text, text_list):
    """Segment unspaced *text* into known words separated by single spaces.

    The text is scanned left to right. At each position the longest entry of
    *text_list* that matches there is emitted as its own token; characters
    that match no entry are collected into a run and emitted as one token
    when the next match (or the end of the text) is reached.

    Args:
        text: The string to segment. A falsy value yields ``''``.
        text_list: Iterable of known words. Empty strings are ignored.

    Returns:
        The tokens joined with a single space.
    """
    if not text:
        return ''

    # Longest first so that e.g. "abc" wins over "ab" at the same position.
    # Empty entries are dropped: '' matches everywhere without consuming any
    # input, which would otherwise make the scan loop forever.
    vocabulary = sorted((entry for entry in text_list if entry),
                        key=len, reverse=True)

    tokens = []
    end = len(text)
    pos = 0        # current scan position
    gap_start = 0  # start of the pending run of unmatched characters

    while pos < end:
        # startswith(word, pos) tests in place instead of re-slicing text.
        hit = next((word for word in vocabulary if text.startswith(word, pos)),
                   None)
        if hit is None:
            pos += 1
            continue
        if gap_start < pos:
            # Flush the unmatched run that precedes this match.
            tokens.append(text[gap_start:pos])
        tokens.append(hit)
        pos += len(hit)
        gap_start = pos

    if gap_start < end:
        # Trailing characters that never matched any word.
        tokens.append(text[gap_start:])

    return ' '.join(tokens)
# Sample input: Hindi number words written back to back with no separators.
text = "जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच"

if __name__ == "__main__":
    # Demo run: segment the sample against the list returned by text_to_list().
    print(convert_to_list(text, text_to_list()))
+# In[33]:
+# # import nbimporter
+# import nbimporter
+# from Text2List import text_to_list
+# def convert_to_list(text, text_list):
+#     matched_words = []
+#     unmatched_text = ''  # To accumulate unmatched characters
+#     # Sort text_list by length in descending order to prioritize longest matches first
+#     text_list_sorted = sorted(text_list, key=len, reverse=True)
+#     while text:
+#         matched = False
+#         for word in text_list_sorted:
+#             if word in text:
+#                 # Add any accumulated unmatched text before appending the matched word
+#                 if unmatched_text:
+#                     matched_words.append(unmatched_text)
+#                     unmatched_text = ''  # Reset unmatched text accumulator
+#                 matched_words.append(word)
+#                 text = text[len(word):]  # Remove the matched part from text
+#                 matched = True
+#                 break
+#         if not matched:
+#             # Accumulate unmatched characters
+#             unmatched_text += text[0]
+#             text = text[1:]
+#     # If there's any remaining unmatched text, add it to the result
+#     if unmatched_text:
+#         matched_words.append(unmatched_text)
+#     # Join matched words and unmatched text with a space
+#     result = ' '.join(matched_words)
+#     return result
+# text = "जीरोएकदोतीनचार"
+# if __name__=="__main__":
+#     converted=convert_to_list(text, text_to_list())
+#     print(converted)

isNumber.py ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
+# Function to check if the string is a number
def is_number(x):
    """Return True if *x* can be converted to a float.

    Strings may contain thousands separators: every ',' is removed before
    the conversion is attempted, so "1,000" counts as a number.

    Args:
        x: Any value; typically a string token.

    Returns:
        True when ``float(x)`` succeeds (after comma removal for strings),
        False when the conversion is rejected.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable
        # text; OverflowError: an int too large to be represented as a float.
        return False
    return True

processDoubles.py ADDED Viewed

	@@ -0,0 +1,25 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
+# Function to process "double" followed by a number
def process_doubles(sentence):
    """Expand every "डबल <token>" pair into "<token> <token>".

    The sentence is split on whitespace. Whenever the marker word "डबल" is
    followed by another token, the marker is dropped and the following token
    is emitted twice. A marker that is the very last token is kept as is.

    Returns the resulting tokens joined with single spaces.
    """
    expanded = []
    stream = iter(sentence.split())
    for token in stream:
        if token != "डबल":
            expanded.append(token)
            continue
        # Consume the token after the marker, if there is one.
        repeated = next(stream, None)
        if repeated is None:
            # Nothing follows the marker: leave it untouched.
            expanded.append(token)
        else:
            expanded.extend((repeated, repeated))
    return ' '.join(expanded)

replaceWords.py ADDED Viewed

	@@ -0,0 +1,159 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[7]:
+import re
# Canonical word -> spellings that should be rewritten to it.
# The multi-word spellings (e.g. 'ट्वेंटी वन') can never equal a single
# whitespace-delimited token; such phrases are still converted correctly
# because each of their words is also listed on its own.
_REPLACEMENT_MAP = {
    # Multiples of ten
    'twenty': ['ट्वेंटी', 'बीस'],
    'thirty': ['थर्टी', 'तीस'],
    'forty': ['फोर्टी', 'चालीस'],
    'fifty': ['फिफ्टी', 'पचास'],
    'sixty': ['सिक्स्टी', 'साठ'],
    'seventy': ['सेवंटी', 'सत्तर', 'सेवनटी', 'सेवेनटी', 'सेवांटी'],
    'eighty': ['एटी', 'अस्सी'],
    'ninety': ['नाइंटी', 'नब्बे'],
    # Numbers from 11 to 19
    'eleven': ['इलेवन', 'ग्यारह', 'इगारा'],
    'twelve': ['ट्वेल्व', 'बारह'],
    'thirteen': ['थर्टीन', 'तेरह', 'तेरा'],
    'fourteen': ['फोर्टीन', 'चौदह'],
    'fifteen': ['फिफ्टीन', 'पंद्रह', 'पंद्रा'],
    'sixteen': ['सिक्स्टीन', 'सोलह', 'सोल्ला'],
    'seventeen': ['सेवंटीन', 'सत्रह', 'सतरा'],
    'eighteen': ['एटीन', 'अठारह', 'अठारा'],
    'nineteen': ['नाइनटीन', 'उन्नीस', 'उन्नईस', 'उनाइस'],
    # Numbers from 21 to 29
    'twenty one': ['ट्वेंटी वन', 'इक्कीस'],
    'twenty two': ['ट्वेंटी टू', 'बाईस'],
    'twenty three': ['ट्वेंटी थ्री', 'तेईस'],
    'twenty four': ['ट्वेंटी फोर', 'चौबीस'],
    'twenty five': ['ट्वेंटी फाइव', 'पच्चीस'],
    'twenty six': ['ट्वेंटी सिक्स', 'छब्बीस'],
    'twenty seven': ['ट्वेंटी सेवन', 'सत्ताईस', 'सताईस'],
    'twenty eight': ['ट्वेंटी एट', 'अट्ठाईस', 'अठ्ठाइस', 'अठ्ठाईस'],
    'twenty nine': ['ट्वेंटी नाइन', 'उनतीस'],
    # Numbers from 31 to 39
    'thirty one': ['थर्टी वन', 'इकतीस'],
    'thirty two': ['थर्टी टू', 'बत्तीस'],
    'thirty three': ['थर्टी थ्री', 'तेतीस'],
    'thirty four': ['थर्टी फोर', 'चौंतीस'],
    'thirty five': ['थर्टी फाइव', 'पैंतीस'],
    'thirty six': ['थर्टी सिक्स', 'छत्तीस'],
    'thirty seven': ['थर्टी सेवन', 'सैंतीस'],
    'thirty eight': ['थर्टी एट', 'अड़तीस'],
    'thirty nine': ['थर्टी नाइन', 'उनतालीस'],
    # Numbers from 41 to 49
    'forty one': ['फोर्टी वन', 'इकतालीस'],
    'forty two': ['फोर्टी टू', 'बयालीस'],
    'forty three': ['फोर्टी थ्री', 'तैंतालीस'],
    'forty four': ['फोर्टी फोर', 'चौंतालीस'],
    'forty five': ['फोर्टी फाइव', 'पैंतालीस'],
    'forty six': ['फोर्टी सिक्स', 'छयालिस'],
    'forty seven': ['फोर्टी सेवन', 'सैंतालीस'],
    'forty eight': ['फोर्टी एट', 'अड़तालीस'],
    'forty nine': ['फोर्टी नाइन', 'उनचास'],
    # Numbers from 51 to 59
    'fifty one': ['फिफ्टी वन', 'इक्यावन'],
    'fifty two': ['फिफ्टी टू', 'बावन'],
    'fifty three': ['फिफ्टी थ्री', 'तिरेपन', 'तिरपन', 'तीरपन'],
    'fifty four': ['फिफ्टी फोर', 'चौवन'],
    'fifty five': ['फिफ्टी फाइव', 'पचपन'],
    'fifty six': ['फिफ्टी सिक्स', 'छप्पन', 'छपपन'],
    'fifty seven': ['फिफ्टी सेवन', 'सत्तावन', 'संताबन', 'संतावन'],
    'fifty eight': ['फिफ्टी एट', 'अट्ठावन', 'अंठावन'],
    'fifty nine': ['फिफ्टी नाइन', 'उनसठ', 'उंसट', 'उंसठ'],
    # Numbers from 61 to 69
    'sixty one': ['सिक्स्टी वन', 'इकसठ'],
    'sixty two': ['सिक्स्टी टू', 'बासठ'],
    'sixty three': ['सिक्स्टी थ्री', 'तिरसठ'],
    'sixty four': ['सिक्स्टी फोर', 'चौंसठ'],
    'sixty five': ['सिक्स्टी फाइव', 'पैंसठ'],
    'sixty six': ['सिक्स्टी सिक्स', 'छियासठ'],
    'sixty seven': ['सिक्स्टी सेवन', 'सड़सठ'],
    'sixty eight': ['सिक्स्टी एट', 'अड़सठ'],
    'sixty nine': ['सिक्स्टी नाइन', 'उनहत्तर'],
    # Numbers from 71 to 79
    'seventy one': ['सेवंटी वन', 'इकहत्तर', 'इखत्तर', 'इकत्तर'],
    'seventy two': ['सेवंटी टू', 'बहत्तर'],
    'seventy three': ['सेवंटी थ्री', 'तिहत्तर', 'तियत्र', 'तियत्तर', 'तीहत्तर', 'तिहत्थर'],
    'seventy four': ['सेवंटी फोर', 'चौहत्तर'],
    'seventy five': ['सेवंटी फाइव', 'पचहत्तर', 'पछत्तर', 'पिछत्तर', 'पचहत्तर', 'पचत्तर'],
    'seventy six': ['सेवंटी सिक्स', 'छिहत्तर', 'छीहत्तर'],
    'seventy seven': ['सेवंटी सेवन', 'सतहत्तर', 'सतात्तर', 'सतत्तर', 'सतहत्थर'],
    'seventy eight': ['सेवंटी एट', 'अठहत्तर', 'अठत्तर'],
    'seventy nine': ['सेवंटी नाइन', 'उन्यासी', 'उनासी'],
    # Numbers from 81 to 89
    'eighty one': ['एटी वन', 'इक्यासी'],
    'eighty two': ['एटी टू', 'बयासी'],
    'eighty three': ['एटी थ्री', 'तिरासी'],
    'eighty four': ['एटी फोर', 'चौरासी'],
    'eighty five': ['एटी फाइव', 'पचासी', 'पिचासी'],
    'eighty six': ['एटी सिक्स', 'छियासी'],
    'eighty seven': ['एटी सेवन', 'सतासी'],
    'eighty eight': ['एटी एट', 'अठासी'],
    'eighty nine': ['एटी नाइन', 'नवासी'],
    # Numbers from 91 to 99
    'ninety one': ['नाइंटी वन', 'इक्यानवे'],
    'ninety two': ['नाइंटी टू', 'बानवे', 'बानबे'],
    'ninety three': ['नाइंटी थ्री', 'तिरानवे'],
    'ninety four': ['नाइंटी फोर', 'चौरानवे'],
    'ninety five': ['नाइंटी फाइव', 'पचानवे'],
    'ninety six': ['नाइंटी सिक्स', 'छियानवे'],
    'ninety seven': ['नाइंटी सेवन', 'सतानवे'],
    'ninety eight': ['नाइंटी एट', 'अठानवे'],
    'ninety nine': ['नाइंटी नाइन', 'निन्यानवे'],
    # Numbers from zero to ten
    'seven': ['सेवन', 'सात'],
    'zero': ['शून्य', 'जेरो', 'शुन्ना', 'जीरो'],
    'one': ['वन', 'एंक', 'इक', 'एक'],
    'two': ['टू', 'दो'],
    'three': ['थ्री', 'तीना', 'तीन', 'त्री'],
    'four': ['फोर', 'फ़ोर', 'फॉर', 'च्यार', 'चार'],
    'five': ['फाइव', 'पाँच', 'पांच'],
    'six': ['सिक्स', 'चह', 'छौ', 'छै', 'छह', 'छे'],
    'eight': ['एट', 'अट', 'आठ'],
    'nine': ['नाइन', 'नौ'],
    'ten': ['टेन', 'दस'],
    # Hundred
    'hundred': ['हंड्रेड', 'सौ', 'सो', 'साव'],
    # Thousand
    'thousand': ['हजार', 'थौजनड', 'थाउजंड', 'हज़ार'],
    # Lakhs
    'lac': ['लाख'],
    # "Double" marker: spelling variants are normalised to one Devanagari form
    'डबल': ['दबल', 'डबल', 'दुबाल'],
}


def _build_token_lookup(replacement_map):
    """Invert ``{canonical: [spellings]}`` into ``{spelling: canonical}``."""
    lookup = {}
    for canonical, spellings in replacement_map.items():
        for spelling in spellings:
            # A spelling listed under several keys resolves to the last one.
            lookup[spelling] = canonical
    return lookup


# Built once at import time so each token costs a single dict lookup.
_TOKEN_TO_CANONICAL = _build_token_lookup(_REPLACEMENT_MAP)


def replace_words(sentence):
    """Rewrite known number-word spellings in *sentence* to canonical words.

    The sentence is split on whitespace and every token that exactly equals
    a known spelling is replaced by its canonical form (an English number
    word, or the normalised "डबल" marker). Other tokens pass through
    unchanged.

    Args:
        sentence: Whitespace-separated text.

    Returns:
        The processed tokens joined with single spaces.
    """
    return ' '.join(
        _TOKEN_TO_CANONICAL.get(token, token) for token in sentence.split()
    )
+# In[ ]:

text2int.py ADDED Viewed

	@@ -0,0 +1,199 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
+# # Function to convert Hindi text to numerical representation
+# from isNumber import is_number
+# def text_to_int (textnum, numwords={}):
+#     units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
+#             'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
+#             'sixteen', 'seventeen', 'eighteen', 'nineteen']
+#     tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
+#     scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion']
+#     ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
+#     ordinal_endings = [('ieth', 'y'), ('th', '')]
+#     if not numwords:
+#         numwords['and'] = (1, 0)
+#         for idx, word in enumerate(units): numwords[word] = (1, idx)
+#         for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
+#         for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
+#     textnum = textnum.replace('-', ' ')
+#     current = result = 0
+#     curstring = ''
+#     onnumber = False
+#     lastunit = False
+#     lastscale = False
+#     def is_numword(x):
+#         if is_number(x):
+#             return True
+#         if word in numwords:
+#             return True
+#         return False
+#     def from_numword(x):
+#         if is_number(x):
+#             scale = 0
+#             increment = int(x.replace(',', ''))
+#             return scale, increment
+#         return numwords[x]
+#     for word in textnum.split():
+#         if word in ordinal_words:
+#             scale, increment = (1, ordinal_words[word])
+#             current = current * scale + increment
+#             if scale > 100:
+#                 result += current
+#                 current = 0
+#             onnumber = True
+#             lastunit = False
+#             lastscale = False
+#         else:
+#             for ending, replacement in ordinal_endings:
+#                 if word.endswith(ending):
+#                     word = "%s%s" % (word[:-len(ending)], replacement)
+#             if (not is_numword(word)) or (word == 'and' and not lastscale):
+#                 if onnumber:
+#                     # Flush the current number we are building
+#                     curstring += repr(result + current) + " "
+#                 curstring += word + " "
+#                 result = current = 0
+#                 onnumber = False
+#                 lastunit = False
+#                 lastscale = False
+#             else:
+#                 scale, increment = from_numword(word)
+#                 onnumber = True
+#                 if lastunit and (word not in scales):
+#                     # Assume this is part of a string of individual numbers to
+#                     # be flushed, such as a zipcode "one two three four five"
+#                     curstring += repr(result + current)
+#                     result = current = 0
+#                 if scale > 1:
+#                     current = max(1, current)
+#                 current = current * scale + increment
+#                 if scale > 100:
+#                     result += current
+#                     current = 0
+#                 lastscale = False
+#                 lastunit = False
+#                 if word in scales:
+#                     lastscale = True
+#                 elif word in units:
+#                     lastunit = True
+#     if onnumber:
+#         curstring += repr(result + current)
+#     return curstring
+# In[ ]:
+from isNumber import is_number  # Remove or replace this if unnecessary
_UNITS = (
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
    'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
    'sixteen', 'seventeen', 'eighteen', 'nineteen',
)
_TENS = ('twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty',
         'ninety')
# Explicit powers of ten. Deriving them from list positions breaks as soon
# as a word such as "lac" is inserted between "thousand" and "million".
_SCALE_EXPONENTS = {
    'hundred': 2,
    'thousand': 3,
    'lac': 5,
    'million': 6,
    'billion': 9,
    'trillion': 12,
}
_ORDINAL_WORDS = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                  'eighth': 8, 'ninth': 9, 'twelfth': 12}
_ORDINAL_ENDINGS = (('ieth', 'y'), ('th', ''))


def _default_numwords():
    """Build the default ``word -> (scale, increment)`` table."""
    table = {'and': (1, 0)}  # lets "one hundred and twenty" parse as 120
    for value, word in enumerate(_UNITS):
        table[word] = (1, value)
    for value, word in enumerate(_TENS, start=2):
        table[word] = (1, value * 10)
    for word, exponent in _SCALE_EXPONENTS.items():
        table[word] = (10 ** exponent, 0)
    return table


_DEFAULT_NUMWORDS = _default_numwords()


def _parse_int_literal(token):
    """Return the int value of a literal such as "1,200", else None."""
    try:
        return int(token.replace(',', ''))
    except ValueError:
        return None


def text_to_int(textnum, numwords=None):
    """Replace spelled-out English numbers in *textnum* with digits.

    Tokens that are not part of a number are copied through unchanged.
    Consecutive unit words are treated as separate numbers, so
    "one two three" becomes "1 2 3" while "twenty one" becomes "21".

    Args:
        textnum: Whitespace-separated text; hyphens are treated as spaces.
        numwords: Optional ``word -> (scale, increment)`` table. When omitted
            the built-in English table (including "lac" = 10**5) is used. An
            empty dict passed by the caller is filled with that table.

    Returns:
        The converted text with tokens separated by single spaces.
    """
    if numwords is None:
        numwords = _DEFAULT_NUMWORDS
    elif not numwords:
        numwords.update(_DEFAULT_NUMWORDS)

    def lookup(token):
        """Return (scale, increment) for a numeric token, else None."""
        literal = _parse_int_literal(token)
        if literal is not None:
            # NOTE(review): scale 0 makes a digit literal overwrite the
            # number in progress ("5 6" yields "6"); kept as is - confirm
            # whether that is intended.
            return 0, literal
        return numwords.get(token)

    pieces = []            # finished output tokens
    current = result = 0   # group being built / total of closed groups
    onnumber = lastunit = lastscale = False

    for raw in textnum.replace('-', ' ').split():
        if raw in _ORDINAL_WORDS:
            # Irregular ordinals ("first", "third", ...) add their value.
            current += _ORDINAL_WORDS[raw]
            onnumber = True
            lastunit = lastscale = False
            continue

        # Regular ordinals: "fourth" -> "four", "twentieth" -> "twenty".
        word = raw
        for ending, replacement in _ORDINAL_ENDINGS:
            if word.endswith(ending):
                word = word[:-len(ending)] + replacement
        entry = lookup(word)
        if entry is None and word != raw:
            # Stripping did not produce a number word ("with" -> "wi"), so
            # keep the token exactly as written.
            word = raw
            entry = lookup(raw)

        if entry is None or (word == 'and' and not lastscale):
            # Ordinary word: emit any number in progress, then the word.
            if onnumber:
                pieces.append(str(result + current))
            pieces.append(word)
            result = current = 0
            onnumber = lastunit = lastscale = False
            continue

        scale, increment = entry
        onnumber = True
        if lastunit and word not in _SCALE_EXPONENTS:
            # A unit directly after a unit starts a new number, as in a
            # digit-by-digit reading such as "one two three".
            pieces.append(str(result + current))
            result = current = 0
        if scale > 1:
            current = max(1, current)  # bare "hundred" means 1 * 100
        current = current * scale + increment
        if scale > 100:
            # Close the group only after thousand and above; "hundred" must
            # stay open so that "five hundred thousand" multiplies through.
            result += current
            current = 0
        lastscale = word in _SCALE_EXPONENTS
        lastunit = word in _UNITS

    if onnumber:
        pieces.append(str(result + current))
    return ' '.join(pieces)
+# In[ ]: