# File size: 5,338 Bytes
# 17bc303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import json
import os
import re
from os.path import join as join_path
import gradio as gr

from text_split import split_by_sentences


class RUAccent:
    """Dictionary-based stress placement for Russian text.

    Two JSON dictionaries are read from ``<workdir>/dictionaries``:

    * ``accents.json``   -- maps a word to its stressed form;
    * ``omographs.json`` -- maps a word to its stress variants.

    Words found in the homograph dictionary are reported to the caller and
    left unstressed; everything else found in the accent dictionary is
    replaced by its stressed form.
    """

    vowels = "аеёиоуыэюя"

    # A token is either a word (optionally containing "+"-joined parts, so
    # input such as "зам+ок" stays one token) or a run of punctuation.
    _WORD_RE = re.compile(r"\w*(?:\+\w+)*|[^\w\s]+")

    # Characters that must not be preceded by a space once tokens are joined
    # back together. "+" and "-" are intentionally not part of this set.
    _PUNCTUATION = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"

    def __init__(self):
        self.omographs = None
        self.accents = None
        # No "yo" dictionary is loaded by load(); an empty mapping keeps
        # _process_yo() a harmless no-op instead of raising AttributeError.
        self.yo_words = {}
        self.workdir = os.getcwd()

    def _read_dictionary(self, filename):
        """Return the parsed JSON file ``<workdir>/dictionaries/<filename>``."""
        path = join_path(self.workdir, "dictionaries", filename)
        # The context manager guarantees the handle is closed even when
        # json.load() raises on malformed content.
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)

    def load(self, custom_dict=None, custom_homographs=None):
        """Load both dictionaries and overlay the caller's custom entries.

        Args:
            custom_dict: optional mapping merged on top of ``accents.json``.
            custom_homographs: optional mapping merged on top of
                ``omographs.json``.

        Raises:
            OSError: if a dictionary file cannot be opened.
            json.JSONDecodeError: if a dictionary file is not valid JSON.
        """
        self.omographs = self._read_dictionary("omographs.json")
        self.omographs.update(custom_homographs or {})

        self.accents = self._read_dictionary("accents.json")
        self.accents.update(custom_dict or {})

    def split_by_words(self, string):
        """Lower-case ``string`` and split it into word and punctuation tokens."""
        # The word alternative can match the empty string, so findall()
        # yields empty matches between tokens; those are dropped here.
        return [token for token in self._WORD_RE.findall(string.lower()) if token]

    def process_all(self, text):
        """Run the whole pipeline over ``text``.

        The text is split into sentences, each sentence is tokenized,
        homographs are collected, stresses are looked up and the tokens are
        joined back into a string.

        Args:
            text: input string.

        Returns:
            A 3-tuple ``(accented_sentence, omographs_list, unknown_list)``:
            a list with one processed string per sentence, a list of
            ``"word: variants"`` strings for every homograph encountered,
            and a list of multi-vowel words missing from the accent
            dictionary.
        """
        accented_sentence = []
        found_homographs = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            sentence_homographs = self._process_omographs(words)
            found_homographs.extend(sentence_homographs)

            stressed_words, unknown_words = self._process_accent(
                words, sentence_homographs
            )
            unknown_list.extend(unknown_words)

            accented_sentence.append(
                self.delete_spaces_before_punc(" ".join(stressed_words))
            )

        omographs_list = [
            f"{key}: {value}"
            for elem in found_homographs
            for key, value in elem.items()
        ]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        """Return a copy of ``text`` with tokens replaced via ``self.yo_words``.

        Tokens missing from the mapping are kept unchanged. The input list
        is not modified.
        """
        return [self.yo_words.get(word, word) for word in text]

    def _process_omographs(self, text):
        """Collect the homographs present in the token list ``text``.

        Returns:
            A list with one single-item dict ``{word: variants}`` per
            occurrence of a word that has a non-empty entry in
            ``self.omographs``.
        """
        found = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                found.append({word: variants})
        return found

    def _process_accent(self, text, founded_omographs):
        """Replace tokens with their stressed forms where that is unambiguous.

        Args:
            text: list of tokens.
            founded_omographs: list of ``{word: variants}`` dicts as returned
                by ``_process_omographs``; these words are left unstressed.

        Returns:
            ``(stressed_text, unknown_words)`` -- a new token list (the input
            list is not modified) and the tokens that are missing from the
            accent dictionary while containing more than one vowel.
        """
        # Built once so the per-word membership test is O(1).
        homograph_words = {
            word for entry in founded_omographs for word in entry
        }

        stressed_text = []
        unknown_words = []
        for word in text:
            stressed_word = self.accents.get(word, word)
            if stressed_word == word:
                # A word with a single vowel has only one possible stress
                # position, so it is not reported as unknown.
                if sum(char in self.vowels for char in word) > 1:
                    unknown_words.append(word)
            elif word in homograph_words:
                # Ambiguous stress: keep the word as typed.
                stressed_word = word
            stressed_text.append(stressed_word)

        return stressed_text, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the single space that precedes a punctuation character."""
        for char in self._PUNCTUATION:
            text = text.replace(" " + char, char)
        return text


# Build the accentuator once at import time; load() reads the JSON
# dictionaries before the UI is constructed.
ru_accent = RUAccent()
ru_accent.load()

title = "Демо для модели расстановки ударения на русском языке"


# NOTE(review): this description looks truncated mid-word -- confirm the
# intended text with the author.
description = "Для расстановки ударения те"

# One textbox per element of the tuple returned by RUAccent.process_all:
# processed text, homographs, words missing from the dictionary.
outputs = [
    gr.Textbox(label="Обработанный текст"),
    gr.Textbox(label="Омографы"),
    gr.Textbox(label="Нет в словаре"),
]

# NOTE(review): `theme` is defined but not passed to gr.Interface below;
# kept as a module-level name in case other code imports it.
theme = "huggingface"

interface = gr.Interface(
    fn=ru_accent.process_all,
    # Fixed a typo in the user-facing label ("расстановкит" -> "расстановки").
    inputs=gr.Textbox(label="текст для расстановки ударения"),
    outputs=outputs,
    title=title,
    description=description,
)

if __name__ == "__main__":
    interface.launch(debug=True, share=True)