Shakhovak committed on
Commit
17bc303
1 Parent(s): a49c1a1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -0
app.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from os.path import join as join_path
5
+ import gradio as gr
6
+
7
+ from text_split import split_by_sentences
8
+
9
+
10
class RUAccent:
    """Dictionary-based stress placement for Russian text.

    Two JSON dictionaries are read from ``<workdir>/dictionaries``:

    * ``accents.json``   - maps a lower-cased word to its stressed form;
    * ``omographs.json`` - maps an ambiguous word to its stress variants.

    Call :meth:`load` once before using :meth:`process_all`.
    """

    vowels = "аеёиоуыэюя"

    # Characters that get glued to the preceding token after the words of a
    # sentence are joined with spaces. Note that "+" and "-" are not listed.
    _PUNCTUATION = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"

    # A token is either a word (optionally containing "+" between letters)
    # or a run of punctuation. The first alternative can match the empty
    # string; those empty matches are filtered out in split_by_words.
    _TOKEN_RE = re.compile(r"\w*(?:\+\w+)*|[^\w\s]+")

    def __init__(self):
        self.omographs = None
        self.accents = None
        # No "yo" dictionary is loaded yet; an empty mapping keeps
        # _process_yo a harmless no-op instead of raising AttributeError.
        # TODO: load dictionaries/yo_words.json in load() once it is shipped.
        self.yo_words = {}
        self.workdir = os.getcwd()

    def _read_dictionary(self, filename):
        """Return the parsed JSON content of ``dictionaries/<filename>``."""
        path = join_path(self.workdir, "dictionaries", filename)
        # The context manager guarantees the handle is closed even when
        # json.load raises on malformed content.
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)

    def load(self, custom_dict=None, custom_homographs=None):
        """Read both dictionaries from disk and apply user overrides.

        Args:
            custom_dict: optional extra ``word -> stressed word`` entries;
                they override entries from ``accents.json``.
            custom_homographs: optional extra homograph entries; they
                override entries from ``omographs.json``.

        Raises:
            OSError: if a dictionary file cannot be opened.
            json.JSONDecodeError: if a dictionary file is not valid JSON.
        """
        self.omographs = self._read_dictionary("omographs.json")
        self.omographs.update(custom_homographs or {})

        self.accents = self._read_dictionary("accents.json")
        self.accents.update(custom_dict or {})

    def split_by_words(self, string):
        """Lower-case ``string`` and split it into word and punctuation tokens.

        Whitespace is dropped; consecutive punctuation characters form one
        token.
        """
        return [token for token in self._TOKEN_RE.findall(string.lower()) if token]

    def process_all(self, text):
        """Place stress marks in ``text``, sentence by sentence.

        This is the core of the program: the text is split into sentences,
        each sentence is tokenised, checked for homographs and then looked
        up in the accent dictionary.

        Args:
            text: the input string.

        Returns:
            A tuple ``(accented_sentence, omographs_list, unknown_list)``:

            * ``accented_sentence`` - list with one processed string per
              sentence;
            * ``omographs_list`` - list of ``"word: variants"`` strings for
              every homograph encountered;
            * ``unknown_list`` - list of words with more than one vowel
              that the accent dictionary left unchanged.
        """
        accented_sentence = []
        found_omographs = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            sentence_omographs = self._process_omographs(words)
            found_omographs.extend(sentence_omographs)

            stressed_words, unknown_words = self._process_accent(
                words, sentence_omographs
            )
            unknown_list.extend(unknown_words)

            joined = " ".join(stressed_words)
            accented_sentence.append(self.delete_spaces_before_punc(joined))

        omographs_list = [
            f"{key}: {value}"
            for entry in found_omographs
            for key, value in entry.items()
        ]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        """Return a copy of ``text`` with words replaced via ``self.yo_words``.

        Words missing from the mapping are kept as they are. The input list
        is not modified.
        """
        return [self.yo_words.get(word, word) for word in text]

    def _process_omographs(self, text):
        """Collect the homographs that occur in the token list ``text``.

        Returns:
            A list of single-key dicts ``{word: variants}``, one per
            occurrence, in order of appearance. Words whose dictionary
            entry is missing or empty are skipped.
        """
        found = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                found.append({word: variants})
        return found

    def _process_accent(self, text, founded_omographs):
        """Look every token up in the accent dictionary.

        Args:
            text: list of lower-cased tokens.
            founded_omographs: list of ``{word: variants}`` dicts as
                produced by :meth:`_process_omographs`; these words are
                left unstressed because their stress is ambiguous.

        Returns:
            A tuple ``(tokens, unknown_words)``. ``tokens`` is a new list
            (the input is not modified) holding the stressed form where one
            is known. ``unknown_words`` lists tokens that the dictionary
            left unchanged and that contain more than one vowel.
        """
        # Build the lookup set once instead of rebuilding a list per token.
        omograph_words = {word for entry in founded_omographs for word in entry}

        tokens = []
        unknown_words = []
        for word in text:
            stressed = self.accents.get(word, word)
            if stressed == word:
                # A word with at most one vowel needs no stress mark, so it
                # is not reported as unknown.
                if sum(char in self.vowels for char in word) > 1:
                    unknown_words.append(word)
                tokens.append(word)
            elif word in omograph_words:
                tokens.append(word)
            else:
                tokens.append(stressed)
        return tokens, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the single space that precedes a punctuation character."""
        for char in self._PUNCTUATION:
            text = text.replace(" " + char, char)
        return text
143
+
144
+
145
# Build the accentuator once at import time so every request served by the
# interface reuses the same in-memory dictionaries.
ru_accent = RUAccent()
ru_accent.load()

title = "Демо для модели расстановки ударения на русском языке"

description = "Для расстановки ударения те"

# One text box per element of the tuple returned by RUAccent.process_all.
outputs = [
    gr.Textbox(label=label)
    for label in ("Обработанный текст", "Омографы", "Нет в словаре")
]

# NOTE(review): `theme` is defined but not passed to gr.Interface below.
theme = "huggingface"

interface = gr.Interface(
    fn=ru_accent.process_all,
    inputs=gr.Textbox(label="текст для расстановкит ударения"),
    outputs=outputs,
    title=title,
    description=description,
)

if __name__ == "__main__":
    interface.launch(debug=True, share=True)