Upload 9 files
Browse files- Text2List.ipynb +95 -0
- convert2list.ipynb +151 -0
- gradio_hindi_number_conversion.ipynb +126 -0
- isNumber.ipynb +43 -0
- main.ipynb +134 -0
- numberMapping.ipynb +162 -0
- processDoubles.ipynb +52 -0
- replaceWords.ipynb +188 -0
- text2int.ipynb +232 -0
Text2List.ipynb
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 4,
|
6 |
+
"id": "94c5b577-d632-422a-a82f-f357f36d491b",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
def text_to_list():
    """Return the vocabulary of Hindi/Devanagari number words to segment against.

    Covers 0-100 in two spellings: native Hindi number words and English
    number words transliterated into Devanagari, plus common ASR spelling
    variants (e.g. several renderings of "seventy").

    Returns:
        list[str]: all recognised number words.  Order is irrelevant to
        callers (convert_to_list re-sorts by length), and duplicates are
        harmless because matching stops at the first hit.

    Bug fixed: the original literal was missing commas between
    'सोल्ला' / 'सत्रह' and between 'उनासी' / 'चौहत्तर', so Python's implicit
    string concatenation fused them into the bogus entries 'सोल्लासत्रह'
    and 'उनासीचौहत्तर' and dropped the four real words.
    """
    text_list = [
        # Devanagari spellings of English numbers (11-19)
        'इलेवन', 'ट्वेल्व', 'थर्टीन', 'फोर्टीन', 'फिफ्टीन', 'सिक्स्टीन', 'सेवन्टीन', 'एटीन', 'नाइन्टीन',
        # Hindi numbers (11-19), including ASR spelling variants
        'ग्यारह', 'बारह', 'तेरह', 'तेरा ', 'चौदह', 'पंद्रह', 'सोलह', 'सोल्ला', 'सत्रह', 'सतरा', 'अठारा', 'उनाइस', 'अठारह', 'उन्नीस',
        # Devanagari spellings of English multiples of ten (20, 30, ..., 90)
        'ट्वेंटी', 'थर्टी', 'फोर्टी', 'फिफ्टी', 'सिक्स्टी', 'सेवेन्टी', 'सेवंटी', 'सत्तर', 'सेवनटी', 'सेवेनटी', 'सेवांटी', 'एटी', 'नाइंटी',
        # Hindi multiples of ten (20, 30, ..., 90)
        'बीस', 'तीस', 'चालीस', 'पचास', 'साठ', 'सत्तर', 'अस्सी', 'नब्बे',
        # Devanagari spellings of English 21-29
        'ट्वेंटी वन', 'ट्वेंटी टू', 'ट्वेंटी थ्री', 'ट्वेंटी फोर', 'ट्वेंटी फाइव', 'ट्वेंटी सिक्स', 'ट्वेंटी सेवन', 'ट्वेंटी एट', 'ट्वेंटी नाइन',
        # Hindi 21-29
        'इक्कीस', 'बाईस', 'तेईस', 'चौबीस', 'पच्चीस', 'छब्बीस', 'सत्ताईस', 'अट्ठाईस', 'उनतीस',
        # Devanagari spellings of English 31-39
        'थर्टी वन', 'थर्टी टू', 'थर्टी थ्री', 'थर्टी फोर', 'थर्टी फाइव', 'थर्टी सिक्स', 'थर्टी सेवन', 'थर्टी एट', 'थर्टी नाइन',
        # Hindi 31-39
        'इकतीस', 'बत्तीस', 'तेतीस', 'चौंतीस', 'पैंतीस', 'छत्तीस', 'सैंतीस', 'अड़तीस', 'उनतालीस',
        # Devanagari spellings of English 41-49
        'फोर्टी वन', 'फोर्टी टू', 'फोर्टी थ्री', 'फोर्टी फोर', 'फोर्टी फाइव', 'फोर्टी सिक्स', 'फोर्टी सेवन', 'फोर्टी एट', 'फोर्टी नाइन',
        # Hindi 41-49
        'इकतालीस', 'बयालीस', 'तैंतालीस', 'चौंतालीस', 'पैंतालीस', 'छयालिस', 'सैंतालीस', 'अड़तालीस', 'उनचास',
        # Devanagari spellings of English 51-59
        'फिफ्टी वन', 'फिफ्टी टू', 'फिफ्टी थ्री', 'फिफ्टी फोर', 'फिफ्टी फाइव', 'फिफ्टी सिक्स', 'फिफ्टी सेवन', 'फिफ्टी एट', 'फिफ्टी नाइन',
        # Hindi 51-59, including ASR spelling variants
        'इक्यावन', 'बावन', 'तिरेपन', 'चौवन', 'पचपन', 'छप्पन', 'सत्तावन', 'संतावन', 'अट्ठावन', 'उनसठ', 'अंठावन', 'उंसट',
        # Devanagari spellings of English 61-69
        'सिक्स्टी वन', 'सिक्स्टी टू', 'सिक्स्टी थ्री', 'सिक्स्टी फोर', 'सिक्स्टी फाइव', 'सिक्स्टी सिक्स', 'सिक्स्टी सेवन', 'सिक्स्टी एट', 'सिक्स्टी नाइन',
        # Hindi 61-69
        'इकसठ', 'बासठ', 'तिरसठ', 'चौंसठ', 'पैंसठ', 'छियासठ', 'सड़सठ', 'अड़सठ', 'उनहत्तर',
        # Devanagari spellings of English 71-79
        'सेवेन्टी वन', 'सेवेन्टी टू', 'सेवेन्टी थ्री', 'सेवेन्टी फोर', 'सेवेन्टी फाइव', 'सेवेन्टी सिक्स', 'सेवेन्टी सेवन', 'सेवेन्टी एट', 'सेवेन्टी नाइन',
        # Hindi 71-79, including ASR spelling variants
        'इकहत्तर', 'बहत्तर', 'तिहत्तर', 'तियत्तर', 'तीहत्तर', 'पचत्तर', 'चिहत्तर', 'अटत्तर', 'उनासी', 'चौहत्तर', 'पचहत्तर', 'छिहत्तर', 'सतहत्तर', 'अठहत्तर', 'उन्यासी', 'उनासी', 'अठत्तर',
        # Devanagari spellings of English 81-89
        'एटी वन', 'एटी टू', 'एटी थ्री', 'एटी फोर', 'एटी फाइव', 'एटी सिक्स', 'एटी सेवन', 'एटी एट', 'एटी नाइन',
        # Hindi 81-89
        'इक्यासी', 'बयासी', 'तिरासी', 'चौरासी', 'पचासी', 'छियासी', 'सतासी', 'अठासी', 'नवासी',
        # Devanagari spellings of English 91-99
        'नाइंटी वन', 'नाइंटी टू', 'नाइंटी थ्री', 'नाइंटी फोर', 'नाइंटी फाइव', 'नाइंटी सिक्स', 'नाइंटी सेवन', 'नाइंटी एट', 'नाइंटी नाइन',
        # Hindi 91-99
        'इक्यानवे', 'बानवे', 'तिरानवे', 'चौरानवे', 'पचानवे', 'छियानवे', 'सतानवे', 'अठानवे', 'निन्यानवे',
        # Devanagari spellings of English numbers (0-10)
        'ज़ीरो', 'वन', 'टू', 'थ्री', 'फोर', 'फाइव', 'सिक्स', 'सेवन', 'एट', 'नाइन', 'टेन',
        # Hindi numbers (0-10)
        'जीरो', 'एक', 'दो', 'तीन', 'चार', 'पांच', 'छह', 'सात', 'आठ', 'नौ', 'दस',
        # Devanagari spelling of English 100
        'हंड्रेड',
        # Hindi for 100
        'सौ',
    ]

    return text_list
|
63 |
+
]
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"cell_type": "code",
|
67 |
+
"execution_count": null,
|
68 |
+
"id": "787fa243-0dba-4a35-b479-2e9198dc6af1",
|
69 |
+
"metadata": {},
|
70 |
+
"outputs": [],
|
71 |
+
"source": []
|
72 |
+
}
|
73 |
+
],
|
74 |
+
"metadata": {
|
75 |
+
"kernelspec": {
|
76 |
+
"display_name": "Python 3 (ipykernel)",
|
77 |
+
"language": "python",
|
78 |
+
"name": "python3"
|
79 |
+
},
|
80 |
+
"language_info": {
|
81 |
+
"codemirror_mode": {
|
82 |
+
"name": "ipython",
|
83 |
+
"version": 3
|
84 |
+
},
|
85 |
+
"file_extension": ".py",
|
86 |
+
"mimetype": "text/x-python",
|
87 |
+
"name": "python",
|
88 |
+
"nbconvert_exporter": "python",
|
89 |
+
"pygments_lexer": "ipython3",
|
90 |
+
"version": "3.11.7"
|
91 |
+
}
|
92 |
+
},
|
93 |
+
"nbformat": 4,
|
94 |
+
"nbformat_minor": 5
|
95 |
+
}
|
convert2list.ipynb
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 30,
|
6 |
+
"id": "b52e9a66-a8e9-4f56-91fd-8564b5b636fc",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stdout",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"जीरो एक दो तीन चार पांच छह सात आठ नौ दस जीरो एक दो तीन चार पांच\n"
|
14 |
+
]
|
15 |
+
}
|
16 |
+
],
|
17 |
+
"source": [
|
18 |
+
"# import nbimporter\n",
|
19 |
+
"import nbimporter\n",
|
20 |
+
"from Text2List import text_to_list\n",
|
21 |
+
def convert_to_list(text, text_list):
    """Segment a run-together transcript into space-separated known words.

    Scans *text* left to right, greedily matching the longest word from
    *text_list* at each position.  Characters that match no known word
    are collected into their own token so nothing is dropped.

    Args:
        text: the unsegmented transcript string.
        text_list: vocabulary of words to recognise.

    Returns:
        str: recognised words and leftover runs, joined by single spaces.
    """
    # Longest first, so a compound word is never shadowed by its prefix.
    vocabulary = sorted(text_list, key=len, reverse=True)

    tokens = []
    pending = ''  # run of characters not yet matched to any word

    while text:
        hit = next((w for w in vocabulary if text.startswith(w)), None)
        if hit is None:
            # Nothing matches at this position; absorb one character.
            pending += text[0]
            text = text[1:]
        else:
            # Flush any accumulated unmatched run before the match.
            if pending:
                tokens.append(pending)
                pending = ''
            tokens.append(hit)
            text = text[len(hit):]

    # Trailing unmatched run, if any.
    if pending:
        tokens.append(pending)

    return ' '.join(tokens)


text = "जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच"

if __name__ == "__main__":
    converted = convert_to_list(text, text_to_list())
    print(converted)
|
60 |
+
]
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"cell_type": "code",
|
64 |
+
"execution_count": 33,
|
65 |
+
"id": "f6655a7c-7481-4a73-a2e6-5327f589bb8b",
|
66 |
+
"metadata": {},
|
67 |
+
"outputs": [
|
68 |
+
{
|
69 |
+
"name": "stdout",
|
70 |
+
"output_type": "stream",
|
71 |
+
"text": [
|
72 |
+
"जीरो तीन तीन चार र\n"
|
73 |
+
]
|
74 |
+
}
|
75 |
+
],
|
76 |
+
"source": [
|
77 |
+
"# # import nbimporter\n",
|
78 |
+
"# import nbimporter\n",
|
79 |
+
"# from Text2List import text_to_list\n",
|
80 |
+
"# def convert_to_list(text, text_list):\n",
|
81 |
+
"# matched_words = []\n",
|
82 |
+
"# unmatched_text = '' # To accumulate unmatched characters\n",
|
83 |
+
"\n",
|
84 |
+
"# # Sort text_list by length in descending order to prioritize longest matches first\n",
|
85 |
+
"# text_list_sorted = sorted(text_list, key=len, reverse=True)\n",
|
86 |
+
"\n",
|
87 |
+
"# while text:\n",
|
88 |
+
"# matched = False\n",
|
89 |
+
"# for word in text_list_sorted:\n",
|
90 |
+
"# if word in text:\n",
|
91 |
+
"# # Add any accumulated unmatched text before appending the matched word\n",
|
92 |
+
"# if unmatched_text:\n",
|
93 |
+
"# matched_words.append(unmatched_text)\n",
|
94 |
+
"# unmatched_text = '' # Reset unmatched text accumulator\n",
|
95 |
+
"\n",
|
96 |
+
"# matched_words.append(word)\n",
|
97 |
+
"# text = text[len(word):] # Remove the matched part from text\n",
|
98 |
+
"# matched = True\n",
|
99 |
+
"# break\n",
|
100 |
+
"\n",
|
101 |
+
"# if not matched:\n",
|
102 |
+
"# # Accumulate unmatched characters\n",
|
103 |
+
"# unmatched_text += text[0]\n",
|
104 |
+
"# text = text[1:]\n",
|
105 |
+
"\n",
|
106 |
+
"# # If there's any remaining unmatched text, add it to the result\n",
|
107 |
+
"# if unmatched_text:\n",
|
108 |
+
"# matched_words.append(unmatched_text)\n",
|
109 |
+
"\n",
|
110 |
+
"# # Join matched words and unmatched text with a space\n",
|
111 |
+
"# result = ' '.join(matched_words)\n",
|
112 |
+
"# return result\n",
|
113 |
+
" \n",
|
114 |
+
"# text = \"जीरोएकदोतीनचार\"\n",
|
115 |
+
"\n",
|
116 |
+
"# if __name__==\"__main__\":\n",
|
117 |
+
"# converted=convert_to_list(text, text_to_list())\n",
|
118 |
+
"# print(converted)"
|
119 |
+
]
|
120 |
+
},
|
121 |
+
{
|
122 |
+
"cell_type": "code",
|
123 |
+
"execution_count": null,
|
124 |
+
"id": "26b725cd-d14f-4d8a-9829-99a7b9a5eeb3",
|
125 |
+
"metadata": {},
|
126 |
+
"outputs": [],
|
127 |
+
"source": []
|
128 |
+
}
|
129 |
+
],
|
130 |
+
"metadata": {
|
131 |
+
"kernelspec": {
|
132 |
+
"display_name": "Python 3 (ipykernel)",
|
133 |
+
"language": "python",
|
134 |
+
"name": "python3"
|
135 |
+
},
|
136 |
+
"language_info": {
|
137 |
+
"codemirror_mode": {
|
138 |
+
"name": "ipython",
|
139 |
+
"version": 3
|
140 |
+
},
|
141 |
+
"file_extension": ".py",
|
142 |
+
"mimetype": "text/x-python",
|
143 |
+
"name": "python",
|
144 |
+
"nbconvert_exporter": "python",
|
145 |
+
"pygments_lexer": "ipython3",
|
146 |
+
"version": "3.11.7"
|
147 |
+
}
|
148 |
+
},
|
149 |
+
"nbformat": 4,
|
150 |
+
"nbformat_minor": 5
|
151 |
+
}
|
gradio_hindi_number_conversion.ipynb
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "bb78ac37-de4f-407a-8fd5-1a269fd937c9",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stderr",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
|
14 |
+
]
|
15 |
+
}
|
16 |
+
],
|
17 |
+
"source": [
|
18 |
+
"# Import necessary libraries and filter warnings\n",
|
19 |
+
"import warnings\n",
|
20 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
21 |
+
"import nbimporter\n",
|
22 |
+
"import os\n",
|
23 |
+
"import re\n",
|
24 |
+
"import numpy as np\n",
|
25 |
+
"import torchaudio\n",
|
26 |
+
"from transformers import pipeline\n",
|
27 |
+
"from text2int import text_to_int\n",
|
28 |
+
"from isNumber import is_number\n",
|
29 |
+
"from Text2List import text_to_list\n",
|
30 |
+
"from convert2list import convert_to_list\n",
|
31 |
+
"from processDoubles import process_doubles\n",
|
32 |
+
"from replaceWords import replace_words\n",
|
33 |
+
"transcriber = pipeline(task=\"automatic-speech-recognition\", model=\"cdactvm/w2v-bert-2.0-hindi_v1\")"
|
34 |
+
]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"cell_type": "code",
|
38 |
+
"execution_count": 2,
|
39 |
+
"id": "02b787e8-6d08-4351-a830-7f7cae7f8243",
|
40 |
+
"metadata": {},
|
41 |
+
"outputs": [
|
42 |
+
{
|
43 |
+
"name": "stdout",
|
44 |
+
"output_type": "stream",
|
45 |
+
"text": [
|
46 |
+
"Running on local URL: http://127.0.0.1:7860\n",
|
47 |
+
"\n",
|
48 |
+
"To create a public link, set `share=True` in `launch()`.\n"
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"data": {
|
53 |
+
"text/html": [
|
54 |
+
"<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
55 |
+
],
|
56 |
+
"text/plain": [
|
57 |
+
"<IPython.core.display.HTML object>"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
"metadata": {},
|
61 |
+
"output_type": "display_data"
|
62 |
+
},
|
63 |
+
{
|
64 |
+
"data": {
|
65 |
+
"text/plain": []
|
66 |
+
},
|
67 |
+
"execution_count": 2,
|
68 |
+
"metadata": {},
|
69 |
+
"output_type": "execute_result"
|
70 |
+
}
|
71 |
+
],
|
72 |
+
"source": [
|
73 |
+
import gradio as gr


def transcribe(audio):
    """Transcribe Hindi speech and convert spoken numbers to digits.

    Args:
        audio: filepath to the recorded clip (as supplied by gr.Audio).

    Returns:
        str: transcript with number words normalised to integers.
    """
    # Run ASR on the recorded clip.
    raw_text = transcriber(audio)['text']
    print(raw_text)
    # Normalisation pipeline: collapse doubled words, segment known
    # number words, canonicalise spellings, then render as digits.
    collapsed = process_doubles(raw_text)
    segmented = convert_to_list(collapsed, text_to_list())
    canonical = replace_words(segmented)
    return text_to_int(canonical)


demo = gr.Interface(
    transcribe,
    gr.Audio(sources="microphone", type="filepath"),
    "text",
)

demo.launch()
|
94 |
+
]
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"cell_type": "code",
|
98 |
+
"execution_count": null,
|
99 |
+
"id": "756c0b55-17b4-4aa0-baac-d8f1c4b003df",
|
100 |
+
"metadata": {},
|
101 |
+
"outputs": [],
|
102 |
+
"source": []
|
103 |
+
}
|
104 |
+
],
|
105 |
+
"metadata": {
|
106 |
+
"kernelspec": {
|
107 |
+
"display_name": "Python 3 (ipykernel)",
|
108 |
+
"language": "python",
|
109 |
+
"name": "python3"
|
110 |
+
},
|
111 |
+
"language_info": {
|
112 |
+
"codemirror_mode": {
|
113 |
+
"name": "ipython",
|
114 |
+
"version": 3
|
115 |
+
},
|
116 |
+
"file_extension": ".py",
|
117 |
+
"mimetype": "text/x-python",
|
118 |
+
"name": "python",
|
119 |
+
"nbconvert_exporter": "python",
|
120 |
+
"pygments_lexer": "ipython3",
|
121 |
+
"version": "3.11.7"
|
122 |
+
}
|
123 |
+
},
|
124 |
+
"nbformat": 4,
|
125 |
+
"nbformat_minor": 5
|
126 |
+
}
|
isNumber.ipynb
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "ac442c1a-b404-4936-afec-7e4eb43bb68b",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
# Function to check if a value parses as a number
def is_number(x):
    """Return True if *x* can be interpreted as a number.

    Strings may contain thousands separators (commas), which are
    stripped before parsing; any other value is handed to float().

    Args:
        x: candidate value (string or numeric).

    Returns:
        bool: True if float() accepts the (comma-stripped) value.
    """
    if isinstance(x, str):
        # Allow '1,234'-style grouped digits.
        x = x.replace(',', '')
    try:
        float(x)
    except (TypeError, ValueError):
        # TypeError: non-numeric type (e.g. None); ValueError: bad string.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return False
    return True
|
19 |
+
]
|
20 |
+
}
|
21 |
+
],
|
22 |
+
"metadata": {
|
23 |
+
"kernelspec": {
|
24 |
+
"display_name": "Python 3 (ipykernel)",
|
25 |
+
"language": "python",
|
26 |
+
"name": "python3"
|
27 |
+
},
|
28 |
+
"language_info": {
|
29 |
+
"codemirror_mode": {
|
30 |
+
"name": "ipython",
|
31 |
+
"version": 3
|
32 |
+
},
|
33 |
+
"file_extension": ".py",
|
34 |
+
"mimetype": "text/x-python",
|
35 |
+
"name": "python",
|
36 |
+
"nbconvert_exporter": "python",
|
37 |
+
"pygments_lexer": "ipython3",
|
38 |
+
"version": "3.11.7"
|
39 |
+
}
|
40 |
+
},
|
41 |
+
"nbformat": 4,
|
42 |
+
"nbformat_minor": 5
|
43 |
+
}
|
main.ipynb
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "54bbfa73-b27d-44a0-895c-81c4d7b3ed7e",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stdout",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"Requirement already satisfied: nbimporter in c:\\users\\wchl\\anaconda3\\envs\\speech_analysis\\lib\\site-packages (0.3.4)\n"
|
14 |
+
]
|
15 |
+
}
|
16 |
+
],
|
17 |
+
"source": [
|
18 |
+
"!pip install nbimporter"
|
19 |
+
]
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"cell_type": "code",
|
23 |
+
"execution_count": 2,
|
24 |
+
"id": "79a3e8c9-559a-4356-9c39-c4abb09ca60d",
|
25 |
+
"metadata": {},
|
26 |
+
"outputs": [
|
27 |
+
{
|
28 |
+
"ename": "RuntimeError",
|
29 |
+
"evalue": "Failed to import transformers.pipelines because of the following error (look up to see its traceback):\nDescriptors cannot be created directly.\nIf this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.\nIf you cannot immediately regenerate your protos, some other possible workarounds are:\n 1. Downgrade the protobuf package to 3.20.x or lower.\n 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).\n\nMore information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates",
|
30 |
+
"output_type": "error",
|
31 |
+
"traceback": [
|
32 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
33 |
+
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
34 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\transformers\\utils\\import_utils.py:1390\u001b[0m, in \u001b[0;36m_LazyModule._get_module\u001b[1;34m(self, module_name)\u001b[0m\n\u001b[0;32m 1389\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 1390\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mimportlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mimport_module\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m.\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mmodule_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1391\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n",
|
35 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\importlib\\__init__.py:127\u001b[0m, in \u001b[0;36mimport_module\u001b[1;34m(name, package)\u001b[0m\n\u001b[0;32m 126\u001b[0m level \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m--> 127\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_bootstrap\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gcd_import\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m[\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpackage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m)\u001b[49m\n",
|
36 |
+
"File \u001b[1;32m<frozen importlib._bootstrap>:1030\u001b[0m, in \u001b[0;36m_gcd_import\u001b[1;34m(name, package, level)\u001b[0m\n",
|
37 |
+
"File \u001b[1;32m<frozen importlib._bootstrap>:1007\u001b[0m, in \u001b[0;36m_find_and_load\u001b[1;34m(name, import_)\u001b[0m\n",
|
38 |
+
"File \u001b[1;32m<frozen importlib._bootstrap>:986\u001b[0m, in \u001b[0;36m_find_and_load_unlocked\u001b[1;34m(name, import_)\u001b[0m\n",
|
39 |
+
"File \u001b[1;32m<frozen importlib._bootstrap>:680\u001b[0m, in \u001b[0;36m_load_unlocked\u001b[1;34m(spec)\u001b[0m\n",
|
40 |
+
"File \u001b[1;32m<frozen importlib._bootstrap_external>:850\u001b[0m, in \u001b[0;36mexec_module\u001b[1;34m(self, module)\u001b[0m\n",
|
41 |
+
"File \u001b[1;32m<frozen importlib._bootstrap>:228\u001b[0m, in \u001b[0;36m_call_with_frames_removed\u001b[1;34m(f, *args, **kwds)\u001b[0m\n",
|
42 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\transformers\\pipelines\\__init__.py:26\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m PreTrainedFeatureExtractor\n\u001b[1;32m---> 26\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimage_processing_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseImageProcessor\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mauto\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfiguration_auto\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AutoConfig\n",
|
43 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\transformers\\image_processing_utils.py:28\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BatchFeature \u001b[38;5;28;01mas\u001b[39;00m BaseBatchFeature\n\u001b[1;32m---> 28\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimage_transforms\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m center_crop, normalize, rescale\n\u001b[0;32m 29\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimage_utils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ChannelDimension\n",
|
44 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\transformers\\image_transforms.py:47\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_tf_available():\n\u001b[1;32m---> 47\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mtf\u001b[39;00m\n\u001b[0;32m 49\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_flax_available():\n",
|
45 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\tensorflow\\__init__.py:37\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01m_typing\u001b[39;00m\n\u001b[1;32m---> 37\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpython\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtools\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m module_util \u001b[38;5;28;01mas\u001b[39;00m _module_util\n\u001b[0;32m 38\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpython\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutil\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlazy_loader\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LazyLoader \u001b[38;5;28;01mas\u001b[39;00m _LazyLoader\n",
|
46 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\tensorflow\\python\\__init__.py:37\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpython\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pywrap_tensorflow \u001b[38;5;28;01mas\u001b[39;00m _pywrap_tensorflow\n\u001b[1;32m---> 37\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpython\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01meager\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m context\n\u001b[0;32m 39\u001b[0m \u001b[38;5;66;03m# pylint: enable=wildcard-import\u001b[39;00m\n\u001b[0;32m 40\u001b[0m \n\u001b[0;32m 41\u001b[0m \u001b[38;5;66;03m# Bring in subpackages.\u001b[39;00m\n",
|
47 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\tensorflow\\python\\eager\\context.py:29\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msix\u001b[39;00m\n\u001b[1;32m---> 29\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mframework\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m function_pb2\n\u001b[0;32m 30\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mprotobuf\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m config_pb2\n",
|
48 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\tensorflow\\core\\framework\\function_pb2.py:16\u001b[0m\n\u001b[0;32m 13\u001b[0m _sym_db \u001b[38;5;241m=\u001b[39m _symbol_database\u001b[38;5;241m.\u001b[39mDefault()\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mframework\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m attr_value_pb2 \u001b[38;5;28;01mas\u001b[39;00m tensorflow_dot_core_dot_framework_dot_attr__value__pb2\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mframework\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m node_def_pb2 \u001b[38;5;28;01mas\u001b[39;00m tensorflow_dot_core_dot_framework_dot_node__def__pb2\n",
|
49 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\tensorflow\\core\\framework\\attr_value_pb2.py:16\u001b[0m\n\u001b[0;32m 13\u001b[0m _sym_db \u001b[38;5;241m=\u001b[39m _symbol_database\u001b[38;5;241m.\u001b[39mDefault()\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mframework\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tensor_pb2 \u001b[38;5;28;01mas\u001b[39;00m tensorflow_dot_core_dot_framework_dot_tensor__pb2\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mframework\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tensor_shape_pb2 \u001b[38;5;28;01mas\u001b[39;00m tensorflow_dot_core_dot_framework_dot_tensor__shape__pb2\n",
|
50 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\tensorflow\\core\\framework\\tensor_pb2.py:16\u001b[0m\n\u001b[0;32m 13\u001b[0m _sym_db \u001b[38;5;241m=\u001b[39m _symbol_database\u001b[38;5;241m.\u001b[39mDefault()\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mframework\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m resource_handle_pb2 \u001b[38;5;28;01mas\u001b[39;00m tensorflow_dot_core_dot_framework_dot_resource__handle__pb2\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mframework\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tensor_shape_pb2 \u001b[38;5;28;01mas\u001b[39;00m tensorflow_dot_core_dot_framework_dot_tensor__shape__pb2\n",
|
51 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\tensorflow\\core\\framework\\resource_handle_pb2.py:16\u001b[0m\n\u001b[0;32m 13\u001b[0m _sym_db \u001b[38;5;241m=\u001b[39m _symbol_database\u001b[38;5;241m.\u001b[39mDefault()\n\u001b[1;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mframework\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m tensor_shape_pb2 \u001b[38;5;28;01mas\u001b[39;00m tensorflow_dot_core_dot_framework_dot_tensor__shape__pb2\n\u001b[0;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtensorflow\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcore\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mframework\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m types_pb2 \u001b[38;5;28;01mas\u001b[39;00m tensorflow_dot_core_dot_framework_dot_types__pb2\n",
|
52 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\tensorflow\\core\\framework\\tensor_shape_pb2.py:36\u001b[0m\n\u001b[0;32m 18\u001b[0m DESCRIPTOR \u001b[38;5;241m=\u001b[39m _descriptor\u001b[38;5;241m.\u001b[39mFileDescriptor(\n\u001b[0;32m 19\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtensorflow/core/framework/tensor_shape.proto\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m 20\u001b[0m package\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtensorflow\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 23\u001b[0m serialized_pb\u001b[38;5;241m=\u001b[39m_b(\u001b[38;5;124m'\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m,tensorflow/core/framework/tensor_shape.proto\u001b[39m\u001b[38;5;130;01m\\x12\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mtensorflow\u001b[39m\u001b[38;5;130;01m\\\"\u001b[39;00m\u001b[38;5;124mz\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\x10\u001b[39;00m\u001b[38;5;124mTensorShapeProto\u001b[39m\u001b[38;5;130;01m\\x12\u001b[39;00m\u001b[38;5;124m-\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\x03\u001b[39;00m\u001b[38;5;130;01m\\x64\u001b[39;00m\u001b[38;5;124mim\u001b[39m\u001b[38;5;130;01m\\x18\u001b[39;00m\u001b[38;5;130;01m\\x02\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;130;01m\\x03\u001b[39;00m\u001b[38;5;124m(\u001b[39m\u001b[38;5;130;01m\\x0b\u001b[39;00m\u001b[38;5;130;01m\\x32\u001b[39;00m\u001b[38;5;124m .tensorflow.TensorShapeProto.Dim\u001b[39m\u001b[38;5;130;01m\\x12\u001b[39;00m\u001b[38;5;130;01m\\x14\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\x0c\u001b[39;00m\u001b[38;5;124munknown_rank\u001b[39m\u001b[38;5;130;01m\\x18\u001b[39;00m\u001b[38;5;130;01m\\x03\u001b[39;00m\u001b[38;5;124m 
\u001b[39m\u001b[38;5;130;01m\\x01\u001b[39;00m\u001b[38;5;124m(\u001b[39m\u001b[38;5;130;01m\\x08\u001b[39;00m\u001b[38;5;130;01m\\x1a\u001b[39;00m\u001b[38;5;124m!\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\x03\u001b[39;00m\u001b[38;5;130;01m\\x44\u001b[39;00m\u001b[38;5;124mim\u001b[39m\u001b[38;5;130;01m\\x12\u001b[39;00m\u001b[38;5;130;01m\\x0c\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\x04\u001b[39;00m\u001b[38;5;124msize\u001b[39m\u001b[38;5;130;01m\\x18\u001b[39;00m\u001b[38;5;130;01m\\x01\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;130;01m\\x01\u001b[39;00m\u001b[38;5;124m(\u001b[39m\u001b[38;5;130;01m\\x03\u001b[39;00m\u001b[38;5;130;01m\\x12\u001b[39;00m\u001b[38;5;130;01m\\x0c\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\x04\u001b[39;00m\u001b[38;5;124mname\u001b[39m\u001b[38;5;130;01m\\x18\u001b[39;00m\u001b[38;5;130;01m\\x02\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;130;01m\\x01\u001b[39;00m\u001b[38;5;124m(\u001b[39m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124mB\u001b[39m\u001b[38;5;130;01m\\x87\u001b[39;00m\u001b[38;5;130;01m\\x01\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\x18\u001b[39;00m\u001b[38;5;124morg.tensorflow.frameworkB\u001b[39m\u001b[38;5;130;01m\\x11\u001b[39;00m\u001b[38;5;124mTensorShapeProtosP\u001b[39m\u001b[38;5;130;01m\\x01\u001b[39;00m\u001b[38;5;124mZSgithub.com/tensorflow/tensorflow/tensorflow/go/core/framework/tensor_shape_go_proto\u001b[39m\u001b[38;5;130;01m\\xf8\u001b[39;00m\u001b[38;5;130;01m\\x01\u001b[39;00m\u001b[38;5;130;01m\\x01\u001b[39;00m\u001b[38;5;130;01m\\x62\u001b[39;00m\u001b[38;5;130;01m\\x06\u001b[39;00m\u001b[38;5;124mproto3\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m 24\u001b[0m )\n\u001b[0;32m 29\u001b[0m _TENSORSHAPEPROTO_DIM \u001b[38;5;241m=\u001b[39m _descriptor\u001b[38;5;241m.\u001b[39mDescriptor(\n\u001b[0;32m 30\u001b[0m 
name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDim\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m 31\u001b[0m full_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtensorflow.TensorShapeProto.Dim\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m 32\u001b[0m filename\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 33\u001b[0m file\u001b[38;5;241m=\u001b[39mDESCRIPTOR,\n\u001b[0;32m 34\u001b[0m containing_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 35\u001b[0m fields\u001b[38;5;241m=\u001b[39m[\n\u001b[1;32m---> 36\u001b[0m \u001b[43m_descriptor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mFieldDescriptor\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 37\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msize\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfull_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtensorflow.TensorShapeProto.Dim.size\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 38\u001b[0m \u001b[43m \u001b[49m\u001b[43mnumber\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mtype\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcpp_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 39\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mhas_default_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdefault_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 40\u001b[0m \u001b[43m \u001b[49m\u001b[43mmessage_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43menum_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontaining_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 41\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_extension\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextension_scope\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 42\u001b[0m \u001b[43m \u001b[49m\u001b[43mserialized_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mDESCRIPTOR\u001b[49m\u001b[43m)\u001b[49m,\n\u001b[0;32m 43\u001b[0m _descriptor\u001b[38;5;241m.\u001b[39mFieldDescriptor(\n\u001b[0;32m 44\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m, full_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtensorflow.TensorShapeProto.Dim.name\u001b[39m\u001b[38;5;124m'\u001b[39m, index\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m,\n\u001b[0;32m 45\u001b[0m number\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m, 
\u001b[38;5;28mtype\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m9\u001b[39m, cpp_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m9\u001b[39m, label\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m,\n\u001b[0;32m 46\u001b[0m has_default_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, default_value\u001b[38;5;241m=\u001b[39m_b(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[0;32m 47\u001b[0m message_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, enum_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, containing_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 48\u001b[0m is_extension\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, extension_scope\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 49\u001b[0m serialized_options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, file\u001b[38;5;241m=\u001b[39mDESCRIPTOR),\n\u001b[0;32m 50\u001b[0m ],\n\u001b[0;32m 51\u001b[0m extensions\u001b[38;5;241m=\u001b[39m[\n\u001b[0;32m 52\u001b[0m ],\n\u001b[0;32m 53\u001b[0m nested_types\u001b[38;5;241m=\u001b[39m[],\n\u001b[0;32m 54\u001b[0m enum_types\u001b[38;5;241m=\u001b[39m[\n\u001b[0;32m 55\u001b[0m ],\n\u001b[0;32m 56\u001b[0m serialized_options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 57\u001b[0m is_extendable\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[0;32m 58\u001b[0m syntax\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mproto3\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m 59\u001b[0m extension_ranges\u001b[38;5;241m=\u001b[39m[],\n\u001b[0;32m 60\u001b[0m oneofs\u001b[38;5;241m=\u001b[39m[\n\u001b[0;32m 61\u001b[0m ],\n\u001b[0;32m 62\u001b[0m 
serialized_start\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m149\u001b[39m,\n\u001b[0;32m 63\u001b[0m serialized_end\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m182\u001b[39m,\n\u001b[0;32m 64\u001b[0m )\n\u001b[0;32m 66\u001b[0m _TENSORSHAPEPROTO \u001b[38;5;241m=\u001b[39m _descriptor\u001b[38;5;241m.\u001b[39mDescriptor(\n\u001b[0;32m 67\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTensorShapeProto\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m 68\u001b[0m full_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtensorflow.TensorShapeProto\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 100\u001b[0m serialized_end\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m182\u001b[39m,\n\u001b[0;32m 101\u001b[0m )\n",
|
53 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\google\\protobuf\\descriptor.py:621\u001b[0m, in \u001b[0;36mFieldDescriptor.__new__\u001b[1;34m(cls, name, full_name, index, number, type, cpp_type, label, default_value, message_type, enum_type, containing_type, is_extension, extension_scope, options, serialized_options, has_default_value, containing_oneof, json_name, file, create_key)\u001b[0m\n\u001b[0;32m 615\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__new__\u001b[39m(\u001b[38;5;28mcls\u001b[39m, name, full_name, index, number, \u001b[38;5;28mtype\u001b[39m, cpp_type, label,\n\u001b[0;32m 616\u001b[0m default_value, message_type, enum_type, containing_type,\n\u001b[0;32m 617\u001b[0m is_extension, extension_scope, options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 618\u001b[0m serialized_options\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 619\u001b[0m has_default_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, containing_oneof\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, json_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 620\u001b[0m file\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, create_key\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m): \u001b[38;5;66;03m# pylint: disable=redefined-builtin\u001b[39;00m\n\u001b[1;32m--> 621\u001b[0m \u001b[43m_message\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mMessage\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_CheckCalledFromGeneratedFile\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_extension:\n",
|
54 |
+
"\u001b[1;31mTypeError\u001b[0m: Descriptors cannot be created directly.\nIf this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.\nIf you cannot immediately regenerate your protos, some other possible workarounds are:\n 1. Downgrade the protobuf package to 3.20.x or lower.\n 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).\n\nMore information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates",
|
55 |
+
"\nThe above exception was the direct cause of the following exception:\n",
|
56 |
+
"\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call last)",
|
57 |
+
"Cell \u001b[1;32mIn[2], line 8\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mre\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorchaudio\u001b[39;00m\n\u001b[1;32m----> 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtransformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m pipeline\n\u001b[0;32m 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtext2int\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m text_to_int\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01misNumber\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m is_number\n",
|
58 |
+
"File \u001b[1;32m<frozen importlib._bootstrap>:1055\u001b[0m, in \u001b[0;36m_handle_fromlist\u001b[1;34m(module, fromlist, import_, recursive)\u001b[0m\n",
|
59 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\transformers\\utils\\import_utils.py:1380\u001b[0m, in \u001b[0;36m_LazyModule.__getattr__\u001b[1;34m(self, name)\u001b[0m\n\u001b[0;32m 1378\u001b[0m value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_module(name)\n\u001b[0;32m 1379\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_class_to_module\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m-> 1380\u001b[0m module \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_module\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_class_to_module\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1381\u001b[0m value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(module, name)\n\u001b[0;32m 1382\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
60 |
+
"File \u001b[1;32m~\\anaconda3\\envs\\Speech_Analysis\\lib\\site-packages\\transformers\\utils\\import_utils.py:1392\u001b[0m, in \u001b[0;36m_LazyModule._get_module\u001b[1;34m(self, module_name)\u001b[0m\n\u001b[0;32m 1390\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m importlib\u001b[38;5;241m.\u001b[39mimport_module(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m module_name, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m)\n\u001b[0;32m 1391\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m-> 1392\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[0;32m 1393\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFailed to import \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodule_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m because of the following error (look up to see its\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1394\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m traceback):\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1395\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n",
|
61 |
+
"\u001b[1;31mRuntimeError\u001b[0m: Failed to import transformers.pipelines because of the following error (look up to see its traceback):\nDescriptors cannot be created directly.\nIf this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.\nIf you cannot immediately regenerate your protos, some other possible workarounds are:\n 1. Downgrade the protobuf package to 3.20.x or lower.\n 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).\n\nMore information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates"
|
62 |
+
]
|
63 |
+
}
|
64 |
+
],
|
65 |
+
"source": [
|
66 |
+
"# Import necessary libraries and filter warnings\n",
|
67 |
+
"import warnings\n",
|
68 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
69 |
+
"import nbimporter\n",
|
70 |
+
"import os\n",
|
71 |
+
"import re\n",
|
72 |
+
"import torchaudio\n",
|
73 |
+
"from transformers import pipeline\n",
|
74 |
+
"from text2int import text_to_int\n",
|
75 |
+
"from isNumber import is_number\n",
|
76 |
+
"from Text2List import text_to_list\n",
|
77 |
+
"from convert2list import convert_to_list\n",
|
78 |
+
"from processDoubles import process_doubles\n",
|
79 |
+
"from replaceWords import replace_words\n",
|
80 |
+
"pipe = pipeline(task=\"automatic-speech-recognition\", model=\"cdactvm/w2v-bert-2.0-hindi_v1\")\n"
|
81 |
+
]
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"cell_type": "code",
|
85 |
+
"execution_count": null,
|
86 |
+
"id": "6075c345-ee6a-4810-b02f-88d517d272e2",
|
87 |
+
"metadata": {},
|
88 |
+
"outputs": [],
|
89 |
+
"source": [
|
90 |
+
"# # Process the audio file\n",
|
91 |
+
"transcript = pipe(\"C:/Users/WCHL/Desktop/hindi_dataset/train/hindi_numbers_test/hindi7.mp3\")\n",
|
92 |
+
"text_value = transcript['text']\n",
|
93 |
+
"processd_doubles=process_doubles(text_value)\n",
|
94 |
+
"converted_to_list=convert_to_list(processd_doubles,text_to_list())\n",
|
95 |
+
"replaced_words = replace_words(converted_to_list)\n",
|
96 |
+
"converted_text=text_to_int(replaced_words)\n",
|
97 |
+
"print(f\"generated text : {text_value}\")\n",
|
98 |
+
"print(f\"processed doubles : {processd_doubles}\")\n",
|
99 |
+
"print(f\"converted to list : {converted_to_list}\")\n",
|
100 |
+
"print(f\"replaced words : {replaced_words}\")\n",
|
101 |
+
"print(f\"final text : {converted_text}\")\n"
|
102 |
+
]
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"cell_type": "code",
|
106 |
+
"execution_count": null,
|
107 |
+
"id": "45040aca-7564-4b07-8e13-1fde7f4f8da2",
|
108 |
+
"metadata": {},
|
109 |
+
"outputs": [],
|
110 |
+
"source": []
|
111 |
+
}
|
112 |
+
],
|
113 |
+
"metadata": {
|
114 |
+
"kernelspec": {
|
115 |
+
"display_name": "Python 3 (ipykernel)",
|
116 |
+
"language": "python",
|
117 |
+
"name": "python3"
|
118 |
+
},
|
119 |
+
"language_info": {
|
120 |
+
"codemirror_mode": {
|
121 |
+
"name": "ipython",
|
122 |
+
"version": 3
|
123 |
+
},
|
124 |
+
"file_extension": ".py",
|
125 |
+
"mimetype": "text/x-python",
|
126 |
+
"name": "python",
|
127 |
+
"nbconvert_exporter": "python",
|
128 |
+
"pygments_lexer": "ipython3",
|
129 |
+
"version": "3.9.18"
|
130 |
+
}
|
131 |
+
},
|
132 |
+
"nbformat": 4,
|
133 |
+
"nbformat_minor": 5
|
134 |
+
}
|
numberMapping.ipynb
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "4f170db4-bc35-4f00-9c10-25ae297beda5",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"replacement_map = {\n",
|
11 |
+
" 'zero': ['शून्य', 'जेरो', 'शुन्ना', 'जीरो'],\n",
|
12 |
+
" 'one': ['वन', 'एंक', 'इक', 'एक'],\n",
|
13 |
+
" 'two': ['टू', 'दौ', 'दो'],\n",
|
14 |
+
" 'three': ['थ्री', 'तीना', 'तीन', 'त्री'],\n",
|
15 |
+
" 'four': ['फोर', 'फॉर', 'च्यार', 'चार'],\n",
|
16 |
+
" 'five': ['फाइव', 'पाँच', 'पांच'],\n",
|
17 |
+
" 'six': ['सिक्स', 'चह', 'छौ', 'छै', 'छह'],\n",
|
18 |
+
" 'seven': ['सेवन', 'सात'],\n",
|
19 |
+
" 'eight': ['एट', 'अट', 'आठ'],\n",
|
20 |
+
" 'nine': ['नाइन', 'नौ'],\n",
|
21 |
+
" 'ten': ['टेन', 'दस'],\n",
|
22 |
+
" \n",
|
23 |
+
" # Numbers from 11 to 19\n",
|
24 |
+
" 'eleven': ['इलेवन', 'ग्यारह'],\n",
|
25 |
+
" 'twelve': ['ट्वेल्व', 'बारह'],\n",
|
26 |
+
" 'thirteen': ['थर्टीन', 'तेरह'],\n",
|
27 |
+
" 'fourteen': ['फोर्टीन', 'चौदह'],\n",
|
28 |
+
" 'fifteen': ['फिफ्टीन', 'पंद्रह'],\n",
|
29 |
+
" 'sixteen': ['सिक्स्टीन', 'सोलह'],\n",
|
30 |
+
" 'seventeen': ['सेवंटीन', 'सत्रह'],\n",
|
31 |
+
" 'eighteen': ['एटीन', 'अठारह'],\n",
|
32 |
+
" 'nineteen': ['नाइनटीन', 'उन्नीस'],\n",
|
33 |
+
"\n",
|
34 |
+
" # Multiples of ten\n",
|
35 |
+
" 'twenty': ['ट्वेंटी', 'बीस'],\n",
|
36 |
+
" 'thirty': ['थर्टी', 'तीस'],\n",
|
37 |
+
" 'forty': ['फोर्टी', 'चालीस'],\n",
|
38 |
+
" 'fifty': ['फिफ्टी', 'पचास'],\n",
|
39 |
+
" 'sixty': ['सिक्स्टी', 'साठ'],\n",
|
40 |
+
" 'seventy': ['सेवंटी', 'सत्तर'],\n",
|
41 |
+
" 'eighty': ['एटी', 'अस्सी'],\n",
|
42 |
+
" 'ninety': ['नाइंटी', 'नब्बे'],\n",
|
43 |
+
"\n",
|
44 |
+
" # Numbers from 21 to 29\n",
|
45 |
+
" 'twenty one': ['ट्वेंटी वन', 'इक्कीस'],\n",
|
46 |
+
" 'twenty two': ['ट्वेंटी टू', 'बाईस'],\n",
|
47 |
+
" 'twenty three': ['ट्वेंटी थ्री', 'तेईस'],\n",
|
48 |
+
" 'twenty four': ['ट्वेंटी फोर', 'चौबीस'],\n",
|
49 |
+
" 'twenty five': ['ट्वेंटी फाइव', 'पच्चीस'],\n",
|
50 |
+
" 'twenty six': ['ट्वेंटी सिक्स', 'छब्बीस'],\n",
|
51 |
+
" 'twenty seven': ['ट्वेंटी सेवन', 'सत्ताईस'],\n",
|
52 |
+
" 'twenty eight': ['ट्वेंटी एट', 'अट्ठाईस'],\n",
|
53 |
+
" 'twenty nine': ['ट्वेंटी नाइन', 'उनतीस'],\n",
|
54 |
+
"\n",
|
55 |
+
" # Numbers from 31 to 39\n",
|
56 |
+
" 'thirty one': ['थर्टी वन', 'इकतीस'],\n",
|
57 |
+
" 'thirty two': ['थर्टी टू', 'बत्तीस'],\n",
|
58 |
+
" 'thirty three': ['थर्टी थ्री', 'तेतीस'],\n",
|
59 |
+
" 'thirty four': ['थर्टी फोर', 'चौंतीस'],\n",
|
60 |
+
" 'thirty five': ['थर्टी फाइव', 'पैंतीस'],\n",
|
61 |
+
" 'thirty six': ['थर्टी सिक्स', 'छत्तीस'],\n",
|
62 |
+
" 'thirty seven': ['थर्टी सेवन', 'सैंतीस'],\n",
|
63 |
+
" 'thirty eight': ['थर्टी एट', 'अड़तीस'],\n",
|
64 |
+
" 'thirty nine': ['थर्टी नाइन', 'उनतालीस'],\n",
|
65 |
+
"\n",
|
66 |
+
" # Numbers from 41 to 49\n",
|
67 |
+
" 'forty one': ['फोर्टी वन', 'इकतालीस'],\n",
|
68 |
+
" 'forty two': ['फोर्टी टू', 'बयालीस'],\n",
|
69 |
+
" 'forty three': ['फोर्टी थ्री', 'तैंतालीस'],\n",
|
70 |
+
" 'forty four': ['फोर्टी फोर', 'चौंतालीस'],\n",
|
71 |
+
" 'forty five': ['फोर्टी फाइव', 'पैंतालीस'],\n",
|
72 |
+
" 'forty six': ['फोर्टी सिक्स', 'छयालिस'],\n",
|
73 |
+
" 'forty seven': ['फोर्टी सेवन', 'सैंतालीस'],\n",
|
74 |
+
" 'forty eight': ['फोर्टी एट', 'अड़तालीस'],\n",
|
75 |
+
" 'forty nine': ['फोर्टी नाइन', 'उनचास'],\n",
|
76 |
+
"\n",
|
77 |
+
" # Numbers from 51 to 59\n",
|
78 |
+
" 'fifty one': ['फिफ्टी वन', 'इक्यावन'],\n",
|
79 |
+
" 'fifty two': ['फिफ्टी टू', 'बावन'],\n",
|
80 |
+
" 'fifty three': ['फिफ्टी थ्री', 'तिरेपन'],\n",
|
81 |
+
" 'fifty four': ['फिफ्टी फोर', 'चौवन'],\n",
|
82 |
+
" 'fifty five': ['फिफ्टी फाइव', 'पचपन'],\n",
|
83 |
+
" 'fifty six': ['फिफ्टी सिक्स', 'छप्पन'],\n",
|
84 |
+
" 'fifty seven': ['फिफ्टी सेवन', 'सत्तावन'],\n",
|
85 |
+
" 'fifty eight': ['फिफ्टी एट', 'अट्ठावन'],\n",
|
86 |
+
" 'fifty nine': ['फिफ्टी नाइन', 'उनसठ'],\n",
|
87 |
+
"\n",
|
88 |
+
" # Numbers from 61 to 69\n",
|
89 |
+
" 'sixty one': ['सिक्स्टी वन', 'इकसठ'],\n",
|
90 |
+
" 'sixty two': ['सिक्स्टी टू', 'बासठ'],\n",
|
91 |
+
" 'sixty three': ['सिक्स्टी थ्री', 'तिरसठ'],\n",
|
92 |
+
" 'sixty four': ['सिक्स्टी फोर', 'चौंसठ'],\n",
|
93 |
+
" 'sixty five': ['सिक्स्टी फाइव', 'पैंसठ'],\n",
|
94 |
+
" 'sixty six': ['सिक्स्टी सिक्स', 'छियासठ'],\n",
|
95 |
+
" 'sixty seven': ['सिक्स्टी सेवन', 'सड़सठ'],\n",
|
96 |
+
" 'sixty eight': ['सिक्स्टी एट', 'अड़सठ'],\n",
|
97 |
+
" 'sixty nine': ['सिक्स्टी नाइन', 'उनहत्तर'],\n",
|
98 |
+
"\n",
|
99 |
+
" # Numbers from 71 to 79\n",
|
100 |
+
" 'seventy one': ['सेवंटी वन', 'इकहत्तर'],\n",
|
101 |
+
" 'seventy two': ['सेवंटी टू', 'बहत्तर'],\n",
|
102 |
+
" 'seventy three': ['सेवंटी थ्री', 'तिहत्तर'],\n",
|
103 |
+
" 'seventy four': ['सेवंटी फोर', 'चौहत्तर'],\n",
|
104 |
+
" 'seventy five': ['सेवंटी फाइव', 'पचहत्तर'],\n",
|
105 |
+
" 'seventy six': ['सेवंटी सिक्स', 'छिहत्तर'],\n",
|
106 |
+
" 'seventy seven': ['सेवंटी सेवन', 'सतहत्तर'],\n",
|
107 |
+
" 'seventy eight': ['सेवंटी एट', 'अठहत्तर'],\n",
|
108 |
+
" 'seventy nine': ['सेवंटी नाइन', 'उन्यासी'],\n",
|
109 |
+
"\n",
|
110 |
+
" # Numbers from 81 to 89\n",
|
111 |
+
" 'eighty one': ['एटी वन', 'इक्यासी'],\n",
|
112 |
+
" 'eighty two': ['एटी टू', 'बयासी'],\n",
|
113 |
+
" 'eighty three': ['एटी थ्री', 'तिरासी'],\n",
|
114 |
+
" 'eighty four': ['एटी फोर', 'चौरासी'],\n",
|
115 |
+
" 'eighty five': ['एटी फाइव', 'पचासी'],\n",
|
116 |
+
" 'eighty six': ['एटी सिक्स', 'छियासी'],\n",
|
117 |
+
" 'eighty seven': ['एटी सेवन', 'सतासी'],\n",
|
118 |
+
" 'eighty eight': ['एटी एट', 'अठासी'],\n",
|
119 |
+
" 'eighty nine': ['एटी नाइन', 'नवासी'],\n",
|
120 |
+
"\n",
|
121 |
+
" # Numbers from 91 to 99\n",
|
122 |
+
" 'ninety one': ['नाइंटी वन', 'इक्यानवे'],\n",
|
123 |
+
" 'ninety two': ['नाइंटी टू', 'बानवे'],\n",
|
124 |
+
" 'ninety three': ['नाइंटी थ्री', 'तिरानवे'],\n",
|
125 |
+
" 'ninety four': ['नाइंटी फोर', 'चौरानवे'],\n",
|
126 |
+
" 'ninety five': ['नाइंटी फाइव', 'पचानवे'],\n",
|
127 |
+
" 'ninety six': ['नाइंटी सिक्स', 'छियानवे'],\n",
|
128 |
+
" 'ninety seven': ['नाइंटी सेवन', 'सतानवे'],\n",
|
129 |
+
" 'ninety eight': ['नाइंटी एट', 'अठानवे'],\n",
|
130 |
+
" 'ninety nine': ['नाइंटी नाइन', 'निन्यानवे'],\n",
|
131 |
+
"\n",
|
132 |
+
" # Hundred\n",
|
133 |
+
" 'hundred': ['हंड्रेड', 'सौ'],\n",
|
134 |
+
"\n",
|
135 |
+
" # Special for double digits\n",
|
136 |
+
" 'डबल': ['दबल', 'डबल', 'दुबाल'],\n",
|
137 |
+
"}\n"
|
138 |
+
]
|
139 |
+
}
|
140 |
+
],
|
141 |
+
"metadata": {
|
142 |
+
"kernelspec": {
|
143 |
+
"display_name": "Python 3 (ipykernel)",
|
144 |
+
"language": "python",
|
145 |
+
"name": "python3"
|
146 |
+
},
|
147 |
+
"language_info": {
|
148 |
+
"codemirror_mode": {
|
149 |
+
"name": "ipython",
|
150 |
+
"version": 3
|
151 |
+
},
|
152 |
+
"file_extension": ".py",
|
153 |
+
"mimetype": "text/x-python",
|
154 |
+
"name": "python",
|
155 |
+
"nbconvert_exporter": "python",
|
156 |
+
"pygments_lexer": "ipython3",
|
157 |
+
"version": "3.11.7"
|
158 |
+
}
|
159 |
+
},
|
160 |
+
"nbformat": 4,
|
161 |
+
"nbformat_minor": 5
|
162 |
+
}
|
processDoubles.ipynb
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "5912ba94-833f-4662-8b8c-f201b5dde892",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"# Function to process \"double\" followed by a number\n",
|
11 |
+
"def process_doubles(sentence):\n",
|
12 |
+
" tokens = sentence.split()\n",
|
13 |
+
" result = []\n",
|
14 |
+
" i = 0\n",
|
15 |
+
" while i < len(tokens):\n",
|
16 |
+
" if tokens[i] == \"डबल\":\n",
|
17 |
+
" if i + 1 < len(tokens):\n",
|
18 |
+
" result.append(tokens[i + 1])\n",
|
19 |
+
" result.append(tokens[i + 1])\n",
|
20 |
+
" i += 2\n",
|
21 |
+
" else:\n",
|
22 |
+
" result.append(tokens[i])\n",
|
23 |
+
" i += 1\n",
|
24 |
+
" else:\n",
|
25 |
+
" result.append(tokens[i])\n",
|
26 |
+
" i += 1\n",
|
27 |
+
" return ' '.join(result)\n"
|
28 |
+
]
|
29 |
+
}
|
30 |
+
],
|
31 |
+
"metadata": {
|
32 |
+
"kernelspec": {
|
33 |
+
"display_name": "Python 3 (ipykernel)",
|
34 |
+
"language": "python",
|
35 |
+
"name": "python3"
|
36 |
+
},
|
37 |
+
"language_info": {
|
38 |
+
"codemirror_mode": {
|
39 |
+
"name": "ipython",
|
40 |
+
"version": 3
|
41 |
+
},
|
42 |
+
"file_extension": ".py",
|
43 |
+
"mimetype": "text/x-python",
|
44 |
+
"name": "python",
|
45 |
+
"nbconvert_exporter": "python",
|
46 |
+
"pygments_lexer": "ipython3",
|
47 |
+
"version": "3.11.7"
|
48 |
+
}
|
49 |
+
},
|
50 |
+
"nbformat": 4,
|
51 |
+
"nbformat_minor": 5
|
52 |
+
}
|
replaceWords.ipynb
ADDED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 7,
|
6 |
+
"id": "19bbb494-3054-48ae-9b64-7f0756c0532d",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"import re\n",
|
11 |
+
"\n",
|
12 |
+
"def replace_words(sentence):\n",
|
13 |
+
" # Define a dictionary mapping a single word to a list of words or phrases\n",
|
14 |
+
" replacement_map = {\n",
|
15 |
+
" # Multiples of ten\n",
|
16 |
+
" 'twenty': ['ट्वेंटी', 'बीस'],\n",
|
17 |
+
" 'thirty': ['थर्टी', 'तीस'],\n",
|
18 |
+
" 'forty': ['फोर्टी', 'चालीस'],\n",
|
19 |
+
" 'fifty': ['फिफ्टी', 'पचास'],\n",
|
20 |
+
" 'sixty': ['सिक्स्टी', 'साठ'],\n",
|
21 |
+
" 'seventy': ['सेवंटी', 'सत्तर','सेवनटी','सेवेनटी','सेवांटी'],\n",
|
22 |
+
" 'eighty': ['एटी', 'अस्सी'],\n",
|
23 |
+
" 'ninety': ['नाइंटी', 'नब्बे'],\n",
|
24 |
+
" \n",
|
25 |
+
" # Numbers from 11 to 19\n",
|
26 |
+
" 'eleven': ['इलेवन', 'ग्यारह','इगारा'],\n",
|
27 |
+
" 'twelve': ['ट्वेल्व', 'बारह'],\n",
|
28 |
+
" 'thirteen': ['थर्टीन', 'तेरह','तेरा'],\n",
|
29 |
+
" 'fourteen': ['फोर्टीन', 'चौदह'],\n",
|
30 |
+
" 'fifteen': ['फिफ्टीन', 'पंद्रह','पंद्रा'],\n",
|
31 |
+
" 'sixteen': ['सिक्स्टीन', 'सोलह','सोल्ला'],\n",
|
32 |
+
" 'seventeen': ['सेवंटीन', 'सत्रह','सतरा'],\n",
|
33 |
+
" 'eighteen': ['एटीन', 'अठारह','अठारा'],\n",
|
34 |
+
" 'nineteen': ['नाइनटीन', 'उन्नीस','उन्नईस','उनाइस'],\n",
|
35 |
+
"\n",
|
36 |
+
" # Numbers from 21 to 29\n",
|
37 |
+
" 'twenty one': ['ट्वेंटी वन', 'इक्कीस'],\n",
|
38 |
+
" 'twenty two': ['ट्वेंटी टू', 'बाईस'],\n",
|
39 |
+
" 'twenty three': ['ट्वेंटी थ्री', 'तेईस'],\n",
|
40 |
+
" 'twenty four': ['ट्वेंटी फोर', 'चौबीस'],\n",
|
41 |
+
" 'twenty five': ['ट्वेंटी फाइव', 'पच्चीस'],\n",
|
42 |
+
" 'twenty six': ['ट्वेंटी सिक्स', 'छब्बीस'],\n",
|
43 |
+
" 'twenty seven': ['ट्वेंटी सेवन', 'सत्ताईस','सताईस'],\n",
|
44 |
+
" 'twenty eight': ['ट्वेंटी एट', 'अट्ठाईस','अठ्ठाइस','अठ्ठाईस'],\n",
|
45 |
+
" 'twenty nine': ['ट्वेंटी नाइन', 'उनतीस'],\n",
|
46 |
+
"\n",
|
47 |
+
" # Numbers from 31 to 39\n",
|
48 |
+
" 'thirty one': ['थर्टी वन', 'इकतीस'],\n",
|
49 |
+
" 'thirty two': ['थर्टी टू', 'बत्तीस'],\n",
|
50 |
+
" 'thirty three': ['थर्टी थ्री', 'तेतीस'],\n",
|
51 |
+
" 'thirty four': ['थर्टी फोर', 'चौंतीस'],\n",
|
52 |
+
" 'thirty five': ['थर्टी फाइव', 'पैंतीस'],\n",
|
53 |
+
" 'thirty six': ['थर्टी सिक्स', 'छत्तीस'],\n",
|
54 |
+
" 'thirty seven': ['थर्टी सेवन', 'सैंतीस'],\n",
|
55 |
+
" 'thirty eight': ['थर्टी एट', 'अड़तीस'],\n",
|
56 |
+
" 'thirty nine': ['थर्टी नाइन', 'उनतालीस'],\n",
|
57 |
+
"\n",
|
58 |
+
" # Numbers from 41 to 49\n",
|
59 |
+
" 'forty one': ['फोर्टी वन', 'इकतालीस'],\n",
|
60 |
+
" 'forty two': ['फोर्टी टू', 'बयालीस'],\n",
|
61 |
+
" 'forty three': ['फोर्टी थ्री', 'तैंतालीस'],\n",
|
62 |
+
" 'forty four': ['फोर्टी फोर', 'चौंतालीस'],\n",
|
63 |
+
" 'forty five': ['फोर्टी फाइव', 'पैंतालीस'],\n",
|
64 |
+
" 'forty six': ['फोर्टी सिक्स', 'छयालिस'],\n",
|
65 |
+
" 'forty seven': ['फोर्टी सेवन', 'सैंतालीस'],\n",
|
66 |
+
" 'forty eight': ['फोर्टी एट', 'अड़तालीस'],\n",
|
67 |
+
" 'forty nine': ['फोर्टी नाइन', 'उनचास'],\n",
|
68 |
+
"\n",
|
69 |
+
" # Numbers from 51 to 59\n",
|
70 |
+
" 'fifty one': ['फिफ्टी वन', 'इक्यावन'],\n",
|
71 |
+
" 'fifty two': ['फिफ्टी टू', 'बावन'],\n",
|
72 |
+
" 'fifty three': ['फिफ्टी थ्री', 'तिरेपन'],\n",
|
73 |
+
" 'fifty four': ['फिफ्टी फोर', 'चौवन'],\n",
|
74 |
+
" 'fifty five': ['फिफ्टी फाइव', 'पचपन'],\n",
|
75 |
+
" 'fifty six': ['फिफ्टी सिक्स', 'छप्पन','छपपन'],\n",
|
76 |
+
" 'fifty seven': ['फिफ्टी सेवन', 'सत्तावन','संताबन','संतावन'],\n",
|
77 |
+
" 'fifty eight': ['फिफ्टी एट', 'अट्ठावन','अंठावन'],\n",
|
78 |
+
" 'fifty nine': ['फिफ्टी नाइन', 'उनसठ','उंसट'],\n",
|
79 |
+
"\n",
|
80 |
+
" # Numbers from 61 to 69\n",
|
81 |
+
" 'sixty one': ['सिक्स्टी वन', 'इकसठ'],\n",
|
82 |
+
" 'sixty two': ['सिक्स्टी टू', 'बासठ'],\n",
|
83 |
+
" 'sixty three': ['सिक्स्टी थ्री', 'तिरसठ'],\n",
|
84 |
+
" 'sixty four': ['सिक्स्टी फोर', 'चौंसठ'],\n",
|
85 |
+
" 'sixty five': ['सिक्स्टी फाइव', 'पैंसठ'],\n",
|
86 |
+
" 'sixty six': ['सिक्स्टी सिक्स', 'छियासठ'],\n",
|
87 |
+
" 'sixty seven': ['सिक्स्टी सेवन', 'सड़सठ'],\n",
|
88 |
+
" 'sixty eight': ['सिक्स्टी एट', 'अड़सठ'],\n",
|
89 |
+
" 'sixty nine': ['सिक्स्टी नाइन', 'उनहत्तर'],\n",
|
90 |
+
"\n",
|
91 |
+
" # Numbers from 71 to 79\n",
|
92 |
+
" 'seventy one': ['सेवंटी वन', 'इकहत्तर','इखत्तर','इकत्तर'],\n",
|
93 |
+
" 'seventy two': ['सेवंटी टू', 'बहत्तर'],\n",
|
94 |
+
" 'seventy three': ['सेवंटी थ्री', 'तिहत्तर','तियत्र','तियत्तर','तीहत्तर','तिहत्थर'],\n",
|
95 |
+
" 'seventy four': ['सेवंटी फोर', 'चौहत्तर',],\n",
|
96 |
+
" 'seventy five': ['सेवंटी फाइव', 'पचहत्तर','पछत्तर','पिछत्तर','पचहत्तर','पचत्तर'],\n",
|
97 |
+
" 'seventy six': ['सेवंटी सिक्स', 'छिहत्तर','छीहत्तर'],\n",
|
98 |
+
" 'seventy seven': ['सेवंटी सेवन', 'सतहत्तर','सतात्तर','सतत्तर','सतहत्थर'],\n",
|
99 |
+
" 'seventy eight': ['सेवंटी एट', 'अठहत्तर','अठत्तर'],\n",
|
100 |
+
" 'seventy nine': ['सेवंटी नाइन', 'उन्यासी','उनासी'],\n",
|
101 |
+
"\n",
|
102 |
+
" # Numbers from 81 to 89\n",
|
103 |
+
" 'eighty one': ['एटी वन', 'इक्यासी'],\n",
|
104 |
+
" 'eighty two': ['एटी टू', 'बयासी'],\n",
|
105 |
+
" 'eighty three': ['एटी थ्री', 'तिरासी'],\n",
|
106 |
+
" 'eighty four': ['एटी फोर', 'चौरासी'],\n",
|
107 |
+
" 'eighty five': ['एटी फाइव', 'पचासी'],\n",
|
108 |
+
" 'eighty six': ['एटी सिक्स', 'छियासी'],\n",
|
109 |
+
" 'eighty seven': ['एटी सेवन', 'सतासी'],\n",
|
110 |
+
" 'eighty eight': ['एटी एट', 'अठासी'],\n",
|
111 |
+
" 'eighty nine': ['एटी नाइन', 'नवासी'],\n",
|
112 |
+
"\n",
|
113 |
+
" # Numbers from 91 to 99\n",
|
114 |
+
" 'ninety one': ['नाइंटी वन', 'इक्यानवे'],\n",
|
115 |
+
" 'ninety two': ['नाइंटी टू', 'बानवे'],\n",
|
116 |
+
" 'ninety three': ['नाइंटी थ्री', 'तिरानवे'],\n",
|
117 |
+
" 'ninety four': ['नाइंटी फोर', 'चौरानवे'],\n",
|
118 |
+
" 'ninety five': ['नाइंटी फाइव', 'पचानवे'],\n",
|
119 |
+
" 'ninety six': ['नाइंटी सिक्स', 'छियानवे'],\n",
|
120 |
+
" 'ninety seven': ['नाइंटी सेवन', 'सतानवे'],\n",
|
121 |
+
" 'ninety eight': ['नाइंटी एट', 'अठानवे'],\n",
|
122 |
+
" 'ninety nine': ['नाइंटी नाइन', 'निन्यानवे'],\n",
|
123 |
+
" # Numbers from one to ten\n",
|
124 |
+
" 'seven': ['सेवन', 'सात'],\n",
|
125 |
+
" 'zero': ['शून्य', 'जेरो', 'शुन्ना', 'जीरो'],\n",
|
126 |
+
" 'one': ['वन', 'एंक', 'इक', 'एक'],\n",
|
127 |
+
" 'two': ['टू', 'दो'],\n",
|
128 |
+
" 'three': ['थ्री', 'तीना', 'तीन', 'त्री'],\n",
|
129 |
+
" 'four': ['फोर', 'फॉर', 'च्यार', 'चार'],\n",
|
130 |
+
" 'five': ['फाइव', 'पाँच', 'पांच'],\n",
|
131 |
+
" 'six': ['सिक्स', 'चह', 'छौ', 'छै', 'छह', 'छे'],\n",
|
132 |
+
" 'eight': ['एट', 'अट', 'आठ'],\n",
|
133 |
+
" 'nine': ['नाइन', 'नौ'],\n",
|
134 |
+
" 'ten': ['टेन', 'दस'],\n",
|
135 |
+
" # Hundred\n",
|
136 |
+
" 'hundred': ['हंड्रेड', 'सौ','सो','साव'],\n",
|
137 |
+
" # Thousand\n",
|
138 |
+
" 'thousand' : ['हजार','थौजनड','थाउजंड','हज़ार'],\n",
|
139 |
+
" # Lakhs\n",
|
140 |
+
" 'lac' : ['लाख'],\n",
|
141 |
+
"\n",
|
142 |
+
" # Special for double digits\n",
|
143 |
+
" 'डबल': ['दबल', 'डबल', 'दुबाल'],\n",
|
144 |
+
" }\n",
|
145 |
+
"\n",
|
146 |
+
" words = sentence.split() # Split the sentence by spaces\n",
|
147 |
+
"\n",
|
148 |
+
" # Replace words using the mapping\n",
|
149 |
+
" for i, word in enumerate(words):\n",
|
150 |
+
" for replacement, patterns in replacement_map.items():\n",
|
151 |
+
" if word in patterns:\n",
|
152 |
+
" words[i] = replacement # Replace the word if it's fully matched\n",
|
153 |
+
"\n",
|
154 |
+
" # Join the processed words back into a sentence\n",
|
155 |
+
" return ' '.join(words)"
|
156 |
+
]
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"cell_type": "code",
|
160 |
+
"execution_count": null,
|
161 |
+
"id": "7bdb593a-cb68-4b04-af8d-b61ea396a5eb",
|
162 |
+
"metadata": {},
|
163 |
+
"outputs": [],
|
164 |
+
"source": []
|
165 |
+
}
|
166 |
+
],
|
167 |
+
"metadata": {
|
168 |
+
"kernelspec": {
|
169 |
+
"display_name": "Python 3 (ipykernel)",
|
170 |
+
"language": "python",
|
171 |
+
"name": "python3"
|
172 |
+
},
|
173 |
+
"language_info": {
|
174 |
+
"codemirror_mode": {
|
175 |
+
"name": "ipython",
|
176 |
+
"version": 3
|
177 |
+
},
|
178 |
+
"file_extension": ".py",
|
179 |
+
"mimetype": "text/x-python",
|
180 |
+
"name": "python",
|
181 |
+
"nbconvert_exporter": "python",
|
182 |
+
"pygments_lexer": "ipython3",
|
183 |
+
"version": "3.11.7"
|
184 |
+
}
|
185 |
+
},
|
186 |
+
"nbformat": 4,
|
187 |
+
"nbformat_minor": 5
|
188 |
+
}
|
text2int.ipynb
ADDED
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"id": "61185b34-45e0-4a78-a84b-2cedd08ad39a",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [],
|
9 |
+
"source": [
|
10 |
+
"# # Function to convert Hindi text to numerical representation\n",
|
11 |
+
"# from isNumber import is_number\n",
|
12 |
+
"\n",
|
13 |
+
"# def text_to_int (textnum, numwords={}):\n",
|
14 |
+
"# units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',\n",
|
15 |
+
"# 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',\n",
|
16 |
+
"# 'sixteen', 'seventeen', 'eighteen', 'nineteen']\n",
|
17 |
+
"# tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']\n",
|
18 |
+
"# scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion']\n",
|
19 |
+
"# ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}\n",
|
20 |
+
"# ordinal_endings = [('ieth', 'y'), ('th', '')]\n",
|
21 |
+
"\n",
|
22 |
+
"# if not numwords:\n",
|
23 |
+
"# numwords['and'] = (1, 0)\n",
|
24 |
+
"# for idx, word in enumerate(units): numwords[word] = (1, idx)\n",
|
25 |
+
"# for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)\n",
|
26 |
+
"# for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)\n",
|
27 |
+
"\n",
|
28 |
+
"# textnum = textnum.replace('-', ' ')\n",
|
29 |
+
"\n",
|
30 |
+
"# current = result = 0\n",
|
31 |
+
"# curstring = ''\n",
|
32 |
+
"# onnumber = False\n",
|
33 |
+
"# lastunit = False\n",
|
34 |
+
"# lastscale = False\n",
|
35 |
+
"\n",
|
36 |
+
"# def is_numword(x):\n",
|
37 |
+
"# if is_number(x):\n",
|
38 |
+
"# return True\n",
|
39 |
+
"# if word in numwords:\n",
|
40 |
+
"# return True\n",
|
41 |
+
"# return False\n",
|
42 |
+
"\n",
|
43 |
+
"# def from_numword(x):\n",
|
44 |
+
"# if is_number(x):\n",
|
45 |
+
"# scale = 0\n",
|
46 |
+
"# increment = int(x.replace(',', ''))\n",
|
47 |
+
"# return scale, increment\n",
|
48 |
+
"# return numwords[x]\n",
|
49 |
+
"\n",
|
50 |
+
"# for word in textnum.split():\n",
|
51 |
+
"# if word in ordinal_words:\n",
|
52 |
+
"# scale, increment = (1, ordinal_words[word])\n",
|
53 |
+
"# current = current * scale + increment\n",
|
54 |
+
"# if scale > 100:\n",
|
55 |
+
"# result += current\n",
|
56 |
+
"# current = 0\n",
|
57 |
+
"# onnumber = True\n",
|
58 |
+
"# lastunit = False\n",
|
59 |
+
"# lastscale = False\n",
|
60 |
+
"# else:\n",
|
61 |
+
"# for ending, replacement in ordinal_endings:\n",
|
62 |
+
"# if word.endswith(ending):\n",
|
63 |
+
"# word = \"%s%s\" % (word[:-len(ending)], replacement)\n",
|
64 |
+
"\n",
|
65 |
+
"# if (not is_numword(word)) or (word == 'and' and not lastscale):\n",
|
66 |
+
"# if onnumber:\n",
|
67 |
+
"# # Flush the current number we are building\n",
|
68 |
+
"# curstring += repr(result + current) + \" \"\n",
|
69 |
+
"# curstring += word + \" \"\n",
|
70 |
+
"# result = current = 0\n",
|
71 |
+
"# onnumber = False\n",
|
72 |
+
"# lastunit = False\n",
|
73 |
+
"# lastscale = False\n",
|
74 |
+
"# else:\n",
|
75 |
+
"# scale, increment = from_numword(word)\n",
|
76 |
+
"# onnumber = True\n",
|
77 |
+
"\n",
|
78 |
+
"# if lastunit and (word not in scales): \n",
|
79 |
+
"# # Assume this is part of a string of individual numbers to \n",
|
80 |
+
"# # be flushed, such as a zipcode \"one two three four five\" \n",
|
81 |
+
"# curstring += repr(result + current) \n",
|
82 |
+
"# result = current = 0 \n",
|
83 |
+
"\n",
|
84 |
+
"# if scale > 1: \n",
|
85 |
+
"# current = max(1, current) \n",
|
86 |
+
"\n",
|
87 |
+
"# current = current * scale + increment \n",
|
88 |
+
"# if scale > 100: \n",
|
89 |
+
"# result += current \n",
|
90 |
+
"# current = 0 \n",
|
91 |
+
"\n",
|
92 |
+
"# lastscale = False \n",
|
93 |
+
"# lastunit = False \n",
|
94 |
+
"# if word in scales: \n",
|
95 |
+
"# lastscale = True \n",
|
96 |
+
"# elif word in units: \n",
|
97 |
+
"# lastunit = True\n",
|
98 |
+
"\n",
|
99 |
+
"# if onnumber:\n",
|
100 |
+
"# curstring += repr(result + current)\n",
|
101 |
+
"\n",
|
102 |
+
"# return curstring\n"
|
103 |
+
]
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"cell_type": "code",
|
107 |
+
"execution_count": null,
|
108 |
+
"id": "a87b26d7-4a0e-4fdc-b03e-1537600faf65",
|
109 |
+
"metadata": {},
|
110 |
+
"outputs": [],
|
111 |
+
"source": [
|
112 |
+
"from isNumber import is_number # Remove or replace this if unnecessary\n",
|
113 |
+
"\n",
|
114 |
+
"def text_to_int(textnum, numwords={}):\n",
|
115 |
+
" # Define units, tens, and scales including \"lac\"\n",
|
116 |
+
" units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',\n",
|
117 |
+
" 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',\n",
|
118 |
+
" 'sixteen', 'seventeen', 'eighteen', 'nineteen']\n",
|
119 |
+
" tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']\n",
|
120 |
+
" scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion'] # \"lac\" added\n",
|
121 |
+
" ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}\n",
|
122 |
+
" ordinal_endings = [('ieth', 'y'), ('th', '')]\n",
|
123 |
+
"\n",
|
124 |
+
" if not numwords:\n",
|
125 |
+
" numwords['and'] = (1, 0) # Handle \"one hundred and twenty\"\n",
|
126 |
+
" \n",
|
127 |
+
" # Add units, tens, and scales to numwords\n",
|
128 |
+
" for idx, word in enumerate(units):\n",
|
129 |
+
" numwords[word] = (1, idx)\n",
|
130 |
+
" for idx, word in enumerate(tens):\n",
|
131 |
+
" numwords[word] = (1, idx * 10)\n",
|
132 |
+
" \n",
|
133 |
+
" for idx, word in enumerate(scales):\n",
|
134 |
+
" numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0) # Handle \"lac\" as 10^5\n",
|
135 |
+
"\n",
|
136 |
+
" # Remove hyphens and normalize input\n",
|
137 |
+
" textnum = textnum.replace('-', ' ')\n",
|
138 |
+
"\n",
|
139 |
+
" current = result = 0\n",
|
140 |
+
" curstring = ''\n",
|
141 |
+
" onnumber = False\n",
|
142 |
+
" lastunit = False\n",
|
143 |
+
" lastscale = False\n",
|
144 |
+
"\n",
|
145 |
+
" def is_numword(x):\n",
|
146 |
+
" return is_number(x) or x in numwords\n",
|
147 |
+
"\n",
|
148 |
+
" def from_numword(x):\n",
|
149 |
+
" if is_number(x):\n",
|
150 |
+
" return 0, int(x.replace(',', ''))\n",
|
151 |
+
" return numwords[x]\n",
|
152 |
+
"\n",
|
153 |
+
" for word in textnum.split():\n",
|
154 |
+
" if word in ordinal_words:\n",
|
155 |
+
" scale, increment = (1, ordinal_words[word])\n",
|
156 |
+
" current = current * scale + increment\n",
|
157 |
+
" if scale > 100:\n",
|
158 |
+
" result += current\n",
|
159 |
+
" current = 0\n",
|
160 |
+
" onnumber = True\n",
|
161 |
+
" lastunit = False\n",
|
162 |
+
" lastscale = False\n",
|
163 |
+
" else:\n",
|
164 |
+
" for ending, replacement in ordinal_endings:\n",
|
165 |
+
" if word.endswith(ending):\n",
|
166 |
+
" word = f\"{word[:-len(ending)]}{replacement}\"\n",
|
167 |
+
"\n",
|
168 |
+
" if not is_numword(word) or (word == 'and' and not lastscale):\n",
|
169 |
+
" if onnumber:\n",
|
170 |
+
" curstring += repr(result + current) + \" \"\n",
|
171 |
+
" curstring += word + \" \"\n",
|
172 |
+
" result = current = 0\n",
|
173 |
+
" onnumber = False\n",
|
174 |
+
" lastunit = False\n",
|
175 |
+
" lastscale = False\n",
|
176 |
+
" else:\n",
|
177 |
+
" scale, increment = from_numword(word)\n",
|
178 |
+
" onnumber = True\n",
|
179 |
+
"\n",
|
180 |
+
" if lastunit and word not in scales:\n",
|
181 |
+
" curstring += repr(result + current) + \" \"\n",
|
182 |
+
" result = current = 0\n",
|
183 |
+
"\n",
|
184 |
+
" if scale > 1:\n",
|
185 |
+
" current = max(1, current)\n",
|
186 |
+
"\n",
|
187 |
+
" current = current * scale + increment\n",
|
188 |
+
"\n",
|
189 |
+
" if scale >= 100:\n",
|
190 |
+
" result += current\n",
|
191 |
+
" current = 0\n",
|
192 |
+
"\n",
|
193 |
+
" lastscale = word in scales\n",
|
194 |
+
" lastunit = word in units\n",
|
195 |
+
"\n",
|
196 |
+
" if onnumber:\n",
|
197 |
+
" curstring += repr(result + current)\n",
|
198 |
+
"\n",
|
199 |
+
" return curstring.strip()"
|
200 |
+
]
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"cell_type": "code",
|
204 |
+
"execution_count": null,
|
205 |
+
"id": "83997c73-e1b4-4863-b1df-d6de6153e80d",
|
206 |
+
"metadata": {},
|
207 |
+
"outputs": [],
|
208 |
+
"source": []
|
209 |
+
}
|
210 |
+
],
|
211 |
+
"metadata": {
|
212 |
+
"kernelspec": {
|
213 |
+
"display_name": "Python 3 (ipykernel)",
|
214 |
+
"language": "python",
|
215 |
+
"name": "python3"
|
216 |
+
},
|
217 |
+
"language_info": {
|
218 |
+
"codemirror_mode": {
|
219 |
+
"name": "ipython",
|
220 |
+
"version": 3
|
221 |
+
},
|
222 |
+
"file_extension": ".py",
|
223 |
+
"mimetype": "text/x-python",
|
224 |
+
"name": "python",
|
225 |
+
"nbconvert_exporter": "python",
|
226 |
+
"pygments_lexer": "ipython3",
|
227 |
+
"version": "3.11.7"
|
228 |
+
}
|
229 |
+
},
|
230 |
+
"nbformat": 4,
|
231 |
+
"nbformat_minor": 5
|
232 |
+
}
|