cdactvm committed on
Commit
bdfde60
·
verified ·
1 Parent(s): a8e84bc

Update convert2list.py

Browse files
Files changed (1) hide show
  1. convert2list.py +89 -108
convert2list.py CHANGED
@@ -1,108 +1,89 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
-
4
- # In[ ]:
5
-
6
-
7
- # import nbimporter
8
- import nbimporter
9
- from Text2List import text_to_list
10
def convert_to_list(text, text_list):
    """Segment run-together *text* into known words, joined by single spaces.

    The text is scanned left to right. At each position the longest entry of
    *text_list* that matches there is taken as a token. Characters that no
    entry matches are collected into a run, and that run becomes a single
    token of its own once the next match (or the end of the text) is reached.

    Args:
        text: The string to segment. A falsy value yields ``''``.
        text_list: Iterable of candidate words. Empty entries are ignored.

    Returns:
        A single string (not a list, despite the function name) containing
        the tokens in their original order, separated by one space each.
    """
    # Longest first, so a word is never pre-empted by one of its own prefixes.
    # Falsy entries are dropped: '' is a prefix of every string but consumes
    # no input, which previously made the scan loop forever.
    vocabulary = sorted((w for w in text_list if w), key=len, reverse=True)

    if not text:
        return ''

    tokens = []
    length = len(text)
    cursor = 0     # current scan position in ``text``
    gap_start = 0  # where the current run of unmatched characters began

    while cursor < length:
        for word in vocabulary:
            # Matching at an offset avoids re-slicing ``text`` on every step.
            if text.startswith(word, cursor):
                if gap_start < cursor:
                    # Flush the unmatched run that preceded this word.
                    tokens.append(text[gap_start:cursor])
                tokens.append(word)
                cursor += len(word)
                gap_start = cursor
                break
        else:
            # Nothing matched here: extend the unmatched run by one character.
            cursor += 1

    if gap_start < length:
        tokens.append(text[gap_start:])

    return ' '.join(tokens)
43
-
44
if __name__ == "__main__":
    # Demo input: number words written back to back with no separators.
    # The assignment used to be commented out, so running this file as a
    # script failed with NameError on ``text``.
    text = "जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच"
    converted = convert_to_list(text, text_to_list())
    print(converted)
49
-
50
-
51
- # In[ ]:
52
-
53
-
54
- # # import nbimporter
55
- # import nbimporter
56
- # from Text2List import text_to_list
57
- # def convert_to_list(text, text_list):
58
- # matched_words = []
59
- # unmatched_text = '' # To accumulate unmatched characters
60
-
61
- # # Sort text_list by length in descending order to prioritize longest matches first
62
- # text_list_sorted = sorted(text_list, key=len, reverse=True)
63
-
64
- # while text:
65
- # matched = False
66
- # for word in text_list_sorted:
67
- # if word in text:
68
- # # Add any accumulated unmatched text before appending the matched word
69
- # if unmatched_text:
70
- # matched_words.append(unmatched_text)
71
- # unmatched_text = '' # Reset unmatched text accumulator
72
-
73
- # matched_words.append(word)
74
- # text = text[len(word):] # Remove the matched part from text
75
- # matched = True
76
- # break
77
-
78
- # if not matched:
79
- # # Accumulate unmatched characters
80
- # unmatched_text += text[0]
81
- # text = text[1:]
82
-
83
- # # If there's any remaining unmatched text, add it to the result
84
- # if unmatched_text:
85
- # matched_words.append(unmatched_text)
86
-
87
- # # Join matched words and unmatched text with a space
88
- # result = ' '.join(matched_words)
89
- # return result
90
-
91
- # text = "जीरोएकदोतीनचार"
92
-
93
- # if __name__=="__main__":
94
- # converted=convert_to_list(text, text_to_list())
95
- # print(converted)
96
-
97
-
98
- # In[ ]:
99
-
100
-
101
- get_ipython().system('git clone https://huggingface.co/StephennFernandes/wav2vec2-XLS-R-300m-konkani')
102
-
103
-
104
- # In[ ]:
105
-
106
-
107
-
108
-
 
1
+ # import nbimporter
2
+ import nbimporter
3
+ from Text2List import text_to_list
4
def convert_to_list(text: str, text_list) -> str:
    """Split *text* into vocabulary words and return them space-separated.

    Matching is greedy: at every position the longest word from *text_list*
    that the remaining text starts with is consumed. Characters that belong
    to no word are accumulated and emitted together as one token just before
    the next matched word, or at the end of the input.

    Args:
        text: Input string without word separators. ``''`` or ``None``
            produces ``''``.
        text_list: Iterable of known words. Empty strings are skipped.

    Returns:
        The tokens joined with a single space. Note that this is a string,
        not a list.
    """
    # Try long words before short ones so the longest match wins. Empty
    # strings must be excluded: ``startswith('')`` is always true yet
    # advances nothing, which used to hang the loop.
    candidates = sorted((word for word in text_list if word), key=len, reverse=True)

    if not text:
        return ""

    pieces = []
    pending = []  # unmatched characters seen since the last matched word
    pos = 0
    end = len(text)

    while pos < end:
        # ``startswith(word, pos)`` tests in place instead of copying the tail
        # of the string on every iteration.
        hit = next((word for word in candidates if text.startswith(word, pos)), None)

        if hit is None:
            pending.append(text[pos])
            pos += 1
            continue

        if pending:
            # Emit the unmatched run before the word that follows it.
            pieces.append("".join(pending))
            pending.clear()
        pieces.append(hit)
        pos += len(hit)

    if pending:
        pieces.append("".join(pending))

    return " ".join(pieces)
37
+
38
if __name__ == "__main__":
    # ``text`` was referenced below while its only assignment was commented
    # out, so executing the script raised NameError. Define the sample input
    # (run-together number words) here, where it is used.
    text = "जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच"
    converted = convert_to_list(text, text_to_list())
    print(converted)
43
+
44
+
45
+ # In[ ]:
46
+
47
+
48
+ # # import nbimporter
49
+ # import nbimporter
50
+ # from Text2List import text_to_list
51
+ # def convert_to_list(text, text_list):
52
+ # matched_words = []
53
+ # unmatched_text = '' # To accumulate unmatched characters
54
+
55
+ # # Sort text_list by length in descending order to prioritize longest matches first
56
+ # text_list_sorted = sorted(text_list, key=len, reverse=True)
57
+
58
+ # while text:
59
+ # matched = False
60
+ # for word in text_list_sorted:
61
+ # if word in text:
62
+ # # Add any accumulated unmatched text before appending the matched word
63
+ # if unmatched_text:
64
+ # matched_words.append(unmatched_text)
65
+ # unmatched_text = '' # Reset unmatched text accumulator
66
+
67
+ # matched_words.append(word)
68
+ # text = text[len(word):] # Remove the matched part from text
69
+ # matched = True
70
+ # break
71
+
72
+ # if not matched:
73
+ # # Accumulate unmatched characters
74
+ # unmatched_text += text[0]
75
+ # text = text[1:]
76
+
77
+ # # If there's any remaining unmatched text, add it to the result
78
+ # if unmatched_text:
79
+ # matched_words.append(unmatched_text)
80
+
81
+ # # Join matched words and unmatched text with a space
82
+ # result = ' '.join(matched_words)
83
+ # return result
84
+
85
+ # text = "जीरोएकदोतीनचार"
86
+
87
+ # if __name__=="__main__":
88
+ # converted=convert_to_list(text, text_to_list())
89
+ # print(converted)