shakhovak committed on
Commit
09cf842
1 Parent(s): 18cd63e

added files

Browse files
Files changed (5) hide show
  1. Dockerfile +11 -0
  2. requirements.txt +11 -0
  3. ruaccent.py +142 -0
  4. text_split.py +134 -0
  5. web_interface.py +44 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.9-alpine

WORKDIR /code

# Install dependencies first so this layer is cached across source changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application sources into /code (the current WORKDIR).
COPY . .

# Sources live directly in /code (the WORKDIR), so the WSGI app is imported
# as `web_interface:app` — the original `code.web_interface:app` would look
# for /code/code/web_interface.py, which does not exist.
CMD ["gunicorn", "-b", "0.0.0.0:7860", "web_interface:app"]
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
# Flask and its transitive dependencies (all versions pinned).
blinker==1.7.0
click==8.1.7
colorama==0.4.6
Flask==3.0.0
importlib-metadata==7.0.0
itsdangerous==2.1.2
Jinja2==3.1.2
MarkupSafe==2.1.3
Werkzeug==3.0.1
zipp==3.17.0
# Production WSGI server used by the Dockerfile CMD.
gunicorn==20.1.0
ruaccent.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from os.path import join as join_path
5
+
6
+ from text_split import split_by_sentences
7
+
8
+
9
class RUAccent:
    """Dictionary-based stress placer for Russian text.

    Stress is marked with a ``+`` before the stressed vowel (e.g.
    ``прив+ет``). Regular words are looked up in the accents dictionary;
    homographs (words whose stress depends on meaning) are left unaccented
    and reported separately, together with unknown multi-vowel words.
    """

    # Russian vowel letters; a word with no dictionary entry and more than
    # one vowel is ambiguous and gets reported as "unknown".
    vowels = "аеёиоуыэюя"

    # Tokenizer: a word possibly carrying "+" stress marks, a plain word,
    # or a run of punctuation. The original pattern's first alternative was
    # `\w*(?:\+\w+)*`, which can match the empty string; in re.findall an
    # empty match advances the scan position past the current character, so
    # punctuation tokens were silently dropped.
    _TOKEN_RE = re.compile(r"\w*(?:\+\w+)+|\w+|[^\w\s]+")

    def __init__(self):
        # Dictionaries are populated by load(); None until then.
        self.omographs = None
        self.accents = None
        self.workdir = os.getcwd()

    def load(self, custom_accent=None, custom_omographs=None):
        """Load accent/omograph dictionaries from ``<cwd>/dictionaries``.

        Args:
            custom_accent: optional dict of word -> accented-word overrides.
            custom_omographs: optional dict of word -> variant-list overrides.
        """
        if custom_omographs is None:
            custom_omographs = {}
        if custom_accent is None:
            custom_accent = {}

        dict_dir = join_path(self.workdir, "dictionaries")

        # Context managers close the JSON files promptly (the original left
        # the file objects from `json.load(open(...))` unclosed).
        with open(join_path(dict_dir, "omographs.json"), encoding="utf-8") as fh:
            self.omographs = json.load(fh)
        self.omographs.update(custom_omographs)

        with open(join_path(dict_dir, "accents.json"), encoding="utf-8") as fh:
            self.accents = json.load(fh)
        self.accents.update(custom_accent)

    def split_by_words(self, string):
        """Lowercase *string* and split it into word tokens (keeping inner
        ``+`` stress marks) and punctuation tokens."""
        tokens = self._TOKEN_RE.findall(string.lower())
        return [token for token in tokens if token]

    def process_all(self, text):
        """Core pipeline: sentence-split, tokenize, and add stress marks.

        Args:
            text: input string.

        Returns:
            A 3-tuple of lists:
            accented_sentence: accented sentence strings;
            omographs_list: ``"word: variants"`` strings for homographs found;
            unknown_list: multi-vowel words missing from the dictionary.
        """
        accented_sentence = []
        omographs_list = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            founded_omographs = self._process_omographs(words)
            omographs_list.extend(founded_omographs)

            processed_words, unknown_words = self._process_accent(words, founded_omographs)
            unknown_list.extend(unknown_words)

            # Re-join tokens and undo the space inserted before punctuation.
            accented_sentence.append(
                self.delete_spaces_before_punc(" ".join(processed_words))
            )

        # Flatten [{word: variants}, ...] into "word: variants" strings.
        omographs_list = [
            f"{key}: {value}"
            for elem in omographs_list
            for key, value in elem.items()
        ]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        """Restore "ё" spellings from a yo-words dictionary.

        NOTE(review): ``self.yo_words`` is never assigned anywhere in this
        file (the load() line is commented out), so calling this raises
        AttributeError; the method is currently unused by process_all().
        """
        for i, word in enumerate(text):
            text[i] = self.yo_words.get(word, word)
        return text

    def _process_omographs(self, text):
        """Return ``[{word: variants}, ...]`` for every token of *text*
        present in the omograph dictionary (order and duplicates kept)."""
        founded_omographs = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                founded_omographs.append({word: variants})
        return founded_omographs

    def _process_accent(self, text, founded_omographs):
        """Substitute dictionary stress forms into *text* in place.

        Homographs are left unaccented (their variants are reported via
        _process_omographs); words with no entry and more than one vowel are
        collected as unknown.

        Returns:
            (accented token list, unknown word list).
        """
        # Hoist the homograph keys out of the loop — the original rebuilt a
        # list of keys for every single word (O(n*m)).
        omograph_words = {next(iter(entry)) for entry in founded_omographs}

        unknown_words = []
        for i, word in enumerate(text):
            stressed_word = self.accents.get(word, word)
            if stressed_word == word:
                # No dictionary entry: only multi-vowel words are ambiguous
                # enough to report (single-vowel words need no mark).
                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
                    unknown_words.append(word)
            elif word in omograph_words:
                # Homograph: keep the plain form; the user resolves it.
                text[i] = word
            else:
                text[i] = stressed_word

        return text, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the space that ``" ".join`` inserted before punctuation."""
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text
133
+
134
+
135
+ # # Example usage:
136
+ # ru_accent = RUAccent()
137
+ # ru_accent.load()
138
+ #
139
+ # text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига из-за этого сучонка"
140
+ # processed_text = ru_accent.process_all(text_to_process)
141
+ #
142
+ # print(processed_text)
text_split.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
import logging  # NOTE(review): imported but never used in this module
from typing import Set, Tuple, List

# Coarse sentence chunker: a run of non-terminator characters plus its
# trailing terminators (. ? ! …) and any closing quotes.
SENTENCE_SPLITTER = re.compile(r'[^\.?!…]+[\.?!…]*["»“]*')

# Last word of a chunk when it ends with a dot (abbreviation candidate).
LAST_WORD_PATTERN = re.compile(r'(?:\b|\d)([a-zа-я]+)\.$', re.IGNORECASE)
# First word of the following chunk.
FIRST_WORD_PATTERN = re.compile(r'^\W*(\w+)')
# Single Latin letter + dot at the end, e.g. "p." — groups: (border, letter).
ENDS_WITH_ONE_LETTER_LAT_AND_DOT_PATTERN = re.compile(r'(\d|\W|\b)([a-zA-Z])\.$')
# Inner-dotted abbreviation like "т.п." at the end of the chunk.
HAS_DOT_INSIDE_PATTERN = re.compile(r'[\w]+\.[\w]+\.$', re.IGNORECASE)
# Single capital initial + dot, e.g. "А." — groups: (border, initial).
INITIALS_PATTERN = re.compile(r'(\W|\b)([A-ZА-Я]{1})\.$')
# 1-4 Russian consonants only — likely an abbreviation, not a real word.
ONLY_RUS_CONSONANTS_PATTERN = re.compile(r'^[бвгджзйклмнпрстфхцчшщ]{1,4}$', re.IGNORECASE)
# A real sentence boundary must be followed by whitespace.
STARTS_WITH_EMPTYNESS_PATTERN = re.compile(r'^\s+')
# Emotional/ellipsis ending, optionally followed by a closing quote/bracket.
ENDS_WITH_EMOTION_PATTERN = re.compile(r'[!?…]|\.{2,}\s?[)"«»,“]?$')
# Next chunk begins with a lower-case letter, optionally after a dash/quote.
# NOTE(review): the character class mixes several dash variants — confirm it
# compiles to the intended set of leading punctuation.
STARTS_WITH_LOWER_PATTERN = re.compile(r'^\s*[–-—-("«]?\s*[a-zа-я]')
# Next chunk begins with a digit.
STARTS_WITH_DIGIT_PATTERN = re.compile(r'^\s*\d')
# Roman or arabic numeration item such as "IV." or "12.".
NUMERATION_PATTERN = re.compile(r'^\W*[IVXMCL\d]+\.$')
# Two one-word shortenings at the end, e.g. "т. е." — groups: (s1, s2).
PAIRED_SHORTENING_IN_THE_END_PATTERN = re.compile(r'\b(\w+)\. (\w+)\.\W*$')

# Boundary verdicts returned by is_sentence_end().
JOIN = 0
MAYBE = 1
SPLIT = 2

# Abbreviations that never terminate a sentence.
JOINING_SHORTENINGS = {
    'mr', 'mrs', 'ms', 'dr', 'vs', 'англ', 'итал', 'греч', 'евр', 'араб', 'яп', 'слав', 'кит',
    'тел', 'св', 'ул', 'устар', 'им', 'г', 'см', 'д', 'стр', 'корп', 'пл', 'пер', 'сокр', 'рис'
}

# Abbreviations that may or may not terminate a sentence (context decides).
SHORTENINGS = {
    'co', 'corp', 'inc', 'авт', 'адм', 'барр', 'внутр', 'га', 'дифф', 'дол', 'долл', 'зав', 'зам', 'искл',
    'коп', 'корп', 'куб', 'лат', 'мин', 'о', 'обл', 'обр', 'прим', 'проц', 'р', 'ред', 'руб', 'рус', 'русск',
    'сан', 'сек', 'тыс', 'эт', 'яз', 'гос', 'мн', 'жен', 'муж', 'накл', 'повел', 'букв', 'шутл', 'ед'
}

# Two-part abbreviations, e.g. "т. е." ("that is").
PAIRED_SHORTENINGS = {('и', 'о'), ('т', 'е'), ('т', 'п'), ('у', 'е'), ('н', 'э')}
+
38
def split_sentences(text: str) -> List[str]:
    """Coarsely split *text* into stripped sentence-like chunks using
    SENTENCE_SPLITTER (no abbreviation handling; see split_by_sentences)."""
    chunks = SENTENCE_SPLITTER.findall(text)
    return [chunk.strip() for chunk in chunks]
40
+
41
+
42
def is_sentence_end(left: str, right: str,
                    shortenings: Set[str],
                    joining_shortenings: Set[str],
                    paired_shortenings: Set[Tuple[str, str]]) -> int:
    """Classify the boundary between *left* and *right*.

    Args:
        left: text of the sentence candidate accumulated so far.
        right: remainder of the text after the candidate.
        shortenings: abbreviations that may or may not end a sentence.
        joining_shortenings: abbreviations that never end a sentence.
        paired_shortenings: two-part abbreviations such as ("т", "е").

    Returns:
        JOIN (not a boundary), MAYBE (ambiguous), or SPLIT (real boundary).
    """
    # A true boundary must be followed by whitespace.
    if not STARTS_WITH_EMPTYNESS_PATTERN.match(right):
        return JOIN

    # Inner-dotted endings like "т.п." are abbreviations, not boundaries.
    if HAS_DOT_INSIDE_PATTERN.search(left):
        return JOIN

    last_match = LAST_WORD_PATTERN.search(left)
    last_word = last_match.group(1) if last_match else ' '

    if last_word.lower() in joining_shortenings:
        return JOIN

    # A short run of Russian consonants is probably an abbreviation.
    if ONLY_RUS_CONSONANTS_PATTERN.search(last_word) and last_word[-1].islower():
        return MAYBE

    paired = PAIRED_SHORTENING_IN_THE_END_PATTERN.search(left)
    if paired and paired.groups() in paired_shortenings:
        return MAYBE

    first_match = FIRST_WORD_PATTERN.match(right)
    if first_match and (last_word, first_match.group(1)) in paired_shortenings:
        return MAYBE

    # "!"/"?"/ellipsis followed by a lower-case continuation stays joined.
    if ENDS_WITH_EMOTION_PATTERN.search(left) and STARTS_WITH_LOWER_PATTERN.match(right):
        return JOIN

    initials = INITIALS_PATTERN.search(left)
    if initials and (initials.group(1) or ' ') not in "°'":
        return JOIN

    if last_word.lower() in shortenings:
        return MAYBE

    one_letter = ENDS_WITH_ONE_LETTER_LAT_AND_DOT_PATTERN.search(left)
    if one_letter and (one_letter.group(1) or ' ') not in "°'":
        return MAYBE

    # Roman/arabic numeration like "IV." keeps the heading attached.
    if NUMERATION_PATTERN.match(left):
        return JOIN

    return SPLIT
95
+
96
+
97
def split_by_sentences(text: str,
                       shortenings: Set[str] = SHORTENINGS,
                       joining_shortenings: Set[str] = JOINING_SHORTENINGS,
                       paired_shortenings: Set[Tuple[str, str]] = PAIRED_SHORTENINGS) -> List[str]:
    """Split *text* into sentences, merging chunks whose boundary
    is_sentence_end() classifies as an abbreviation rather than an end.

    Args:
        text: input text.
        shortenings: abbreviations that may or may not end a sentence.
        joining_shortenings: abbreviations that never end a sentence.
        paired_shortenings: two-part abbreviations such as ("т", "е").

    Returns:
        List of non-empty, stripped sentence strings.
    """
    sentences = []
    sents = split_sentences(text)
    si = 0
    processed_index = 0
    sent_start = 0
    while si < len(sents):
        s = sents[si]
        # Locate this chunk inside the original text so spans stay exact
        # (the chunks were stripped, so offsets must be recomputed).
        span_start = text[processed_index:].index(s) + processed_index
        span_end = span_start + len(s)
        processed_index += len(s)

        si += 1

        send = is_sentence_end(text[sent_start: span_end], text[span_end:],
                               shortenings, joining_shortenings, paired_shortenings)
        if send == JOIN:
            continue

        if send == MAYBE:
            # Ambiguous abbreviation followed by a lower-case word or a
            # digit is treated as sentence-internal.
            if STARTS_WITH_LOWER_PATTERN.match(text[span_end:]):
                continue
            if STARTS_WITH_DIGIT_PATTERN.match(text[span_end:]):
                continue

        sentence = text[sent_start: span_end].strip()
        if sentence:
            sentences.append(sentence)
        else:
            # The original had a leftover debug `print(text)` here and then
            # appended the empty string anyway; log instead and skip the
            # empty sentence (the tail handler below already guards empties).
            logging.getLogger(__name__).debug(
                "empty sentence span while splitting: %r", text)
        sent_start = span_end
        processed_index = span_end

    if sent_start != len(text):
        if text[sent_start:].strip():
            sentences.append(text[sent_start:].strip())
    return sentences
web_interface.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from flask import Flask, render_template, request, send_file
from flask import abort

from ruaccent import RUAccent
import text_split
5
+
6
app = Flask(__name__)

# Load the accent/omograph dictionaries once at import time so every request
# reuses the same in-memory dictionaries (load() reads two JSON files from
# the "dictionaries" directory under the current working directory).
ru_accent = RUAccent()
ru_accent.load()
10
+
11
@app.route('/')
def index():
    # Render the text-input form (templates/index.html).
    return render_template('index.html')
14
+
15
@app.route('/process', methods=['POST'])
def process():
    """Accent the submitted text and write the three result files
    (accented text, homographs, unknown words) for later download.

    Returns the rendered result page.
    """
    # The route already restricts to POST, so the original
    # `if request.method == 'POST'` check (whose else-branch implicitly
    # returned None, i.e. a 500) is unnecessary.
    input_text = request.form['input_text']
    accented, omographs, unknown = ru_accent.process_all(input_text)

    # File name -> content for each downloadable report (same names that
    # the /download route serves).
    outputs = {
        'accented_text.txt': " ".join(accented),
        'omographs.txt': "\n".join(omographs),
        'unknown.txt': "\n".join(unknown),
    }
    # NOTE(review): files are written to the process CWD and are shared
    # across requests/workers, so concurrent requests overwrite each other.
    for file_name, content in outputs.items():
        with open(file_name, 'w', encoding="utf-8") as file:
            file.write(content)

    return render_template('result.html')
37
+
38
@app.route('/download/<file_name>')
def download(file_name):
    """Serve one of the generated result files as an attachment.

    Only the three report files produced by /process may be fetched; any
    other name is rejected with 404. The original passed the user-supplied
    name straight to send_file, which could expose arbitrary files from the
    working directory.
    """
    if file_name not in {'accented_text.txt', 'omographs.txt', 'unknown.txt'}:
        abort(404)
    return send_file(file_name, as_attachment=True, download_name=file_name)
42
+
43
if __name__ == '__main__':
    # Production serving is done by gunicorn (see Dockerfile); keep the dev
    # server line for reference. The original left this `if` with only a
    # comment in its body, which is a SyntaxError that broke the module.
    # app.run(debug=True, port=5001)
    pass