shakhovak commited on
Commit
5af977b
1 Parent(s): b54d798
Files changed (4) hide show
  1. app.py +112 -31
  2. dictionaries/accents.json +0 -3
  3. dictionaries/omographs.json +0 -0
  4. ruaccent.py +0 -142
app.py CHANGED
@@ -1,44 +1,125 @@
1
from flask import Flask, render_template, request, send_file
import os

from ruaccent import RUAccent
import text_split

app = Flask(__name__)

# Load the accent dictionaries once at startup (load() reads JSON from disk).
ru_accent = RUAccent()
ru_accent.load()

# Only these generated files may ever be served back to the client.
# Whitelisting closes the path-traversal hole in /download/<file_name>
# (e.g. /download/..%2Fapp.py).
ALLOWED_DOWNLOADS = ("accented_text.txt", "omographs.txt", "unknown.txt")


def _write_joined(file_name, parts, separator):
    """Write *parts* joined by *separator* to *file_name* as UTF-8."""
    with open(file_name, "w", encoding="utf-8") as fh:
        fh.write(separator.join(parts))


@app.route('/')
def index():
    """Serve the text-input form."""
    return render_template('index.html')


@app.route('/process', methods=['POST'])
def process():
    """Accent the submitted text and persist the three result files.

    The route only accepts POST, so the old `request.method` check was dead.
    """
    input_text = request.form['input_text']
    accented, omographs, unknown = ru_accent.process_all(input_text)

    _write_joined('accented_text.txt', accented, " ")
    _write_joined('omographs.txt', omographs, "\n")
    _write_joined('unknown.txt', unknown, "\n")

    return render_template('result.html')


@app.route('/download/<file_name>')
def download(file_name):
    """Serve one of the generated result files.

    Rejects any name outside ALLOWED_DOWNLOADS so a crafted URL cannot
    read arbitrary files from the working directory.
    """
    if file_name not in ALLOWED_DOWNLOADS:
        return "File not found", 404
    return send_file(file_name, as_attachment=True, download_name=file_name)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
  import os
3
+ import re
4
+ from os.path import join as join_path
5
 
6
+ from text_split import split_by_sentences
7
 
 
 
8
 
9
class RUAccent:
    """Dictionary-based stress ('+') placer for Russian text.

    Call load() once to read the JSON dictionaries, then process_all()
    per text. Omographs (words with several valid stress positions) are
    reported but deliberately left unaccented.
    """

    # Russian vowel letters; used to decide whether an undictionaried
    # word is worth reporting as unknown (single-vowel words are trivial).
    vowels = "аеёиоуыэюя"

    def __init__(self):
        # Populated by load(); omographs maps word -> {"acc_variants": [...]},
        # accents maps word -> {"accent": accented_form} — TODO confirm the
        # exact JSON schema against dictionaries/file_omo.json / file_norm.json.
        self.omographs = None
        self.accents = None
        # yo_words is only used by _process_yo; keep it an empty dict so the
        # method is a safe no-op instead of raising AttributeError.
        self.yo_words = {}
        self.workdir = os.getcwd()

    def load(self, custom_accent=None, custom_omographs=None):
        """Read the dictionaries from disk, then fold in user overrides.

        Args:
            custom_accent: optional {word: {"accent": ...}} overrides.
            custom_omographs: optional {word: {"acc_variants": [...]}} overrides.
        """
        if custom_omographs is None:
            custom_omographs = {}
        if custom_accent is None:
            custom_accent = {}

        # Use context managers so the file handles are closed promptly.
        omo_path = join_path(self.workdir, "dictionaries", "file_omo.json")
        with open(omo_path, encoding='utf-8') as fh:
            self.omographs = json.load(fh)
        self.omographs.update(custom_omographs)

        acc_path = join_path(self.workdir, "dictionaries", "file_norm.json")
        with open(acc_path, encoding='utf-8') as fh:
            self.accents = json.load(fh)
        self.accents.update(custom_accent)

    def split_by_words(self, string):
        """Lower-case *string* and split it into words (keeping '+' marks)
        and punctuation runs; empty matches are dropped."""
        result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
        return [res for res in result if res]

    def process_all(self, text):
        """Ядро всей программы: accent every sentence of *text*.

        Input:
            text: string

        Output:
            accented_sentence: list[string] — sentences with '+' accents
            omographs_list: list[string] — unique "word: variants" entries
            unknown_list: list[string] — unique words absent from dictionaries
        """
        accented_sentence = []
        omographs_list = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            founded_omographs = self._process_omographs(words)
            omographs_list.extend(founded_omographs)

            processed_text, unknown_words = self._process_accent(words, founded_omographs)
            unknown_list.extend(unknown_words)

            joined = self.delete_spaces_before_punc(" ".join(processed_text))
            accented_sentence.append(joined)

        omographs_list = [f"{key}: {value}" for elem in omographs_list for key, value in elem.items()]
        return accented_sentence, list(set(omographs_list)), list(set(unknown_list))

    def _process_yo(self, text):
        """Restore 'ё' spellings in-place from self.yo_words (currently empty)."""
        for i, word in enumerate(text):
            text[i] = self.yo_words.get(word, word)
        return text

    def _process_omographs(self, text):
        """Return [{word: acc_variants}] for each ambiguous word in *text*."""
        founded_omographs = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                # Reuse the looked-up entry instead of a second dict access.
                founded_omographs.append({word: variants["acc_variants"]})
        return founded_omographs

    def _process_accent(self, text, founded_omographs):
        """Accent each word of *text* in-place.

        Returns (words, unknown_words). Omographs are left untouched;
        multi-vowel words missing from the dictionary are reported unknown.
        """
        unknown_words = []
        # Hoist the omograph keys out of the loop (each dict holds one word).
        # BUG FIX: the original compared `stressed_word` (a dict for known
        # words) against these string keys, so omographs present in the
        # accents dictionary were force-accented instead of skipped.
        omograph_words = {list(d.keys())[0] for d in founded_omographs}
        for i, word in enumerate(text):
            if word in omograph_words:
                # Ambiguous word: leave unaccented rather than guess.
                text[i] = word
                continue

            entry = self.accents.get(word)
            if entry is not None:
                text[i] = entry["accent"]
            else:
                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
                    unknown_words.append(word)
                text[i] = word

        return text, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the space that " ".join() left before punctuation marks."""
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text


# # Example usage:
# ru_accent = RUAccent()
# ru_accent.load()
#
# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига нига нига из-за этого сучонка"
# processed_text = ru_accent.process_all(text_to_process)
#
# print(processed_text)
dictionaries/accents.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:adb807918505efc4f2707e6536f52951e2be3bc3f714a7285fecdc7434c7f7b8
3
- size 178733505
 
 
 
 
dictionaries/omographs.json DELETED
The diff for this file is too large to render. See raw diff
 
ruaccent.py DELETED
@@ -1,142 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from os.path import join as join_path
5
-
6
- from text_split import split_by_sentences
7
-
8
-
9
class RUAccent:
    """Places stress marks ('+') in Russian text using lookup dictionaries.

    load() must be called before process_all(). Words with several valid
    stress positions (omographs) are reported separately and left as-is.
    """

    # Russian vowel letters, used to filter trivial one-vowel words
    # out of the "unknown" report.
    vowels = "аеёиоуыэюя"

    def __init__(self):
        # Both dictionaries are populated by load().
        self.omographs = None
        self.accents = None
        self.workdir = os.getcwd()

    def load(self, custom_accent=None, custom_omographs=None):
        """Load the JSON dictionaries and fold in optional user overrides."""
        extra_omographs = {} if custom_omographs is None else custom_omographs
        extra_accents = {} if custom_accent is None else custom_accent

        omographs_file = join_path(self.workdir, "dictionaries", "omographs.json")
        self.omographs = json.load(open(omographs_file, encoding='utf-8'))
        self.omographs.update(extra_omographs)

        accents_file = join_path(self.workdir, "dictionaries", "accents.json")
        self.accents = json.load(open(accents_file, encoding='utf-8'))
        self.accents.update(extra_accents)

    def split_by_words(self, string):
        """Lower-case *string* and split it into words (keeping '+' marks)
        and punctuation runs, discarding empty matches."""
        tokens = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
        return [token for token in tokens if token]

    def process_all(self, text):
        """Ядро всей программы: accent every sentence of *text*.

        Input:
            text: string

        Output:
            accented_sentence: list[string]
            omographs_list: list[string]
            unknown_list: list[string]
        """
        accented_sentence = []
        omographs_list = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            hits = self._process_omographs(words)
            omographs_list.extend(hits)

            accented_words, missing = self._process_accent(words, hits)
            unknown_list.extend(missing)

            joined = self.delete_spaces_before_punc(" ".join(accented_words))
            accented_sentence.append(joined)

        omographs_list = [
            f"{key}: {value}"
            for entry in omographs_list
            for key, value in entry.items()
        ]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        """Restore 'ё' spellings in-place (needs self.yo_words to be set)."""
        for i, word in enumerate(text):
            text[i] = self.yo_words.get(word, word)
        return text

    def _process_omographs(self, text):
        """Collect {word: variants} for every ambiguous word in *text*."""
        found = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                found.append({word: variants})
        return found

    def _process_accent(self, text, founded_omographs):
        """Accent each word of *text* in-place; also return unknown words."""
        unknown_words = []
        for i, word in enumerate(text):
            stressed_word = self.accents.get(word, word)
            if stressed_word == word:
                # Not in the dictionary: report multi-vowel words as unknown.
                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
                    unknown_words.append(word)
                text[i] = word
            elif word in [list(d.keys())[0] for d in founded_omographs]:
                # Ambiguous word — keep it unaccented for later review.
                text[i] = word
            else:
                text[i] = stressed_word
        return text, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the space that " ".join() left before punctuation marks."""
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text


# # Example usage:
# ru_accent = RUAccent()
# ru_accent.load()
#
# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига из-за этого сучонка"
# processed_text = ru_accent.process_all(text_to_process)
#
# print(processed_text)
- # print(processed_text)