Spaces:

Shakhovak
/

RU_ACCENT

Sleeping

App Files Files Community

Shakhovak committed on Dec 15, 2023

Commit

17bc303

•

1 Parent(s): a49c1a1

Create app.py

Browse files

Files changed (1) hide show

app.py +170 -0

app.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import json
+import os
+import re
+from os.path import join as join_path
+import gradio as gr
+from text_split import split_by_sentences
class RUAccent:
    """Dictionary-based stress placement for Russian text.

    Two JSON dictionaries are read from ``<workdir>/dictionaries``:

    * ``accents.json``   -- maps a lower-cased word to its stressed form
      (presumably with ``+`` marking the stress; confirm against the data).
    * ``omographs.json`` -- maps a homograph to its possible variants.

    Homographs are reported to the caller and left unstressed, because the
    dictionary alone cannot pick the right variant.
    """

    vowels = "аеёиоуыэюя"

    # A token is either a word (optionally containing ``+`` stress marks
    # followed by more word characters) or a run of punctuation.
    _TOKEN_RE = re.compile(r"\w*(?:\+\w+)*|[^\w\s]+")

    # Punctuation that must not be preceded by a space in the final text.
    _PUNCTUATION = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
    _SPACE_BEFORE_PUNC_RE = re.compile(" ([" + re.escape(_PUNCTUATION) + "])")

    def __init__(self):
        self.omographs = None
        self.accents = None
        # No "yo" dictionary is loaded by ``load``; an empty mapping keeps
        # ``_process_yo`` a harmless pass-through instead of raising
        # AttributeError.
        self.yo_words = {}
        self.workdir = os.getcwd()

    def _load_json(self, filename):
        """Read ``<workdir>/dictionaries/<filename>`` as UTF-8 JSON."""
        path = join_path(self.workdir, "dictionaries", filename)
        # The context manager closes the handle even if parsing fails.
        with open(path, encoding="utf-8") as file:
            return json.load(file)

    def load(self, custom_dict=None, custom_homographs=None):
        """Load both dictionaries and overlay optional user-supplied entries.

        Args:
            custom_dict: extra ``word -> stressed word`` entries that override
                or extend ``accents.json``.
            custom_homographs: extra ``word -> variants`` entries that override
                or extend ``omographs.json``.

        Raises:
            OSError: a dictionary file cannot be opened.
            json.JSONDecodeError: a dictionary file is not valid JSON.
        """
        self.omographs = self._load_json("omographs.json")
        if custom_homographs is not None:
            self.omographs.update(custom_homographs)

        self.accents = self._load_json("accents.json")
        if custom_dict is not None:
            self.accents.update(custom_dict)

    def split_by_words(self, string):
        """Lower-case ``string`` and split it into word and punctuation tokens."""
        # The first alternative of the pattern can match the empty string, so
        # empty matches are filtered out.
        return [token for token in self._TOKEN_RE.findall(string.lower()) if token]

    def process_all(self, text):
        """Place stress marks in ``text``; the core of the whole program.

        The text is split into sentences, every sentence is tokenised,
        homographs are collected, and the remaining words are replaced by
        their stressed forms from the accent dictionary.

        Args:
            text: input string.

        Returns:
            A tuple ``(accented_sentence, omographs_list, unknown_list)``:

            * accented_sentence -- list of processed sentences (strings);
            * omographs_list -- list of ``"word: variants"`` strings, one per
              homograph occurrence;
            * unknown_list -- words with more than one vowel for which the
              accent dictionary gave no different form.
        """
        accented_sentence = []
        found_omographs = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            sentence_omographs = self._process_omographs(words)
            found_omographs.extend(sentence_omographs)

            stressed_words, unknown_words = self._process_accent(
                words, sentence_omographs
            )
            unknown_list.extend(unknown_words)

            joined = " ".join(stressed_words)
            accented_sentence.append(self.delete_spaces_before_punc(joined))

        omographs_list = [
            f"{key}: {value}"
            for elem in found_omographs
            for key, value in elem.items()
        ]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        """Return a copy of the token list with words replaced via ``yo_words``.

        Words missing from ``self.yo_words`` are kept unchanged.
        """
        return [self.yo_words.get(word, word) for word in text]

    def _process_omographs(self, text):
        """Collect the homographs that occur in the token list.

        Returns:
            A list with one ``{word: variants}`` dict per occurrence, in order
            of appearance. The tokens themselves are not modified.
        """
        found = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                found.append({word: variants})
        return found

    def _process_accent(self, text, founded_omographs):
        """Replace tokens with their stressed forms from the accent dictionary.

        Args:
            text: list of tokens.
            founded_omographs: list of ``{word: variants}`` dicts as returned
                by ``_process_omographs``; these words are left unstressed.

        Returns:
            A tuple ``(tokens, unknown_words)``. ``tokens`` is a new list (the
            input list is not modified); ``unknown_words`` holds every token
            that has more than one vowel and for which the dictionary yielded
            no different form.
        """
        # Build the lookup once instead of re-creating a list for every word.
        omograph_words = {key for entry in founded_omographs for key in entry}

        processed = []
        unknown_words = []
        for word in text:
            stressed_word = self.accents.get(word, word)
            if stressed_word == word:
                # Single-vowel words need no stress mark, so only words with
                # several vowels are reported as unknown.
                if sum(char in RUAccent.vowels for char in word) > 1:
                    unknown_words.append(word)
                processed.append(word)
            elif word in omograph_words:
                # Ambiguous word: keep it unstressed.
                processed.append(word)
            else:
                processed.append(stressed_word)
        return processed, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the single space that directly precedes a punctuation mark."""
        return self._SPACE_BEFORE_PUNC_RE.sub(r"\1", text)
# Build the accentuator once at import time so every request reuses the
# already loaded dictionaries.
ru_accent = RUAccent()
ru_accent.load()

title = "Демо для модели расстановки ударения на русском языке"
# NOTE(review): this description looks truncated -- confirm the intended text.
description = "Для расстановки ударения те"

# One output box per element of the tuple returned by ``process_all``.
outputs = [
    gr.Textbox(label="Обработанный текст"),
    gr.Textbox(label="Омографы"),
    gr.Textbox(label="Нет в словаре"),
]

# NOTE(review): ``theme`` is defined but never passed to ``gr.Interface``.
theme = "huggingface"

interface = gr.Interface(
    fn=ru_accent.process_all,
    # Typo fixed in the user-facing label ("расстановкит" -> "расстановки").
    inputs=gr.Textbox(label="текст для расстановки ударения"),
    outputs=outputs,
    title=title,
    description=description,
)

if __name__ == "__main__":
    interface.launch(debug=True, share=True)