shakhovak committed on
Commit
09cf842
1 Parent(s): 18cd63e

added files

Browse files
Files changed (5) hide show
  1. Dockerfile +11 -0
  2. requirements.txt +11 -0
  3. ruaccent.py +142 -0
  4. text_split.py +134 -0
  5. web_interface.py +44 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.9-alpine

WORKDIR /code

# Install dependencies first so this layer is cached across source changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Copy the application sources into /code (the current WORKDIR).
COPY . .

# Sources live directly in /code (the WORKDIR), so the WSGI app is imported
# as `web_interface:app` — the original `code.web_interface:app` would look
# for /code/code/web_interface.py, which does not exist.
CMD ["gunicorn", "-b", "0.0.0.0:7860", "web_interface:app"]
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
# Flask and its transitive dependencies (all versions pinned).
blinker==1.7.0
click==8.1.7
colorama==0.4.6
Flask==3.0.0
importlib-metadata==7.0.0
itsdangerous==2.1.2
Jinja2==3.1.2
MarkupSafe==2.1.3
Werkzeug==3.0.1
zipp==3.17.0
# Production WSGI server used by the Dockerfile CMD.
gunicorn==20.1.0
ruaccent.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from os.path import join as join_path
5
+
6
+ from text_split import split_by_sentences
7
+
8
+
9
class RUAccent:
    """Dictionary-based stress placer for Russian text.

    Stress is marked with a ``+`` before the stressed vowel (e.g.
    ``прив+ет``). Regular words are looked up in the accents dictionary;
    homographs (words whose stress depends on meaning) are left unaccented
    and reported separately, together with unknown multi-vowel words.
    """

    # Russian vowel letters; a word with no dictionary entry and more than
    # one vowel is ambiguous and gets reported as "unknown".
    vowels = "аеёиоуыэюя"

    # Tokenizer: a word possibly carrying "+" stress marks, a plain word,
    # or a run of punctuation. The original pattern's first alternative was
    # `\w*(?:\+\w+)*`, which can match the empty string; in re.findall an
    # empty match advances the scan position past the current character, so
    # punctuation tokens were silently dropped.
    _TOKEN_RE = re.compile(r"\w*(?:\+\w+)+|\w+|[^\w\s]+")

    def __init__(self):
        # Dictionaries are populated by load(); None until then.
        self.omographs = None
        self.accents = None
        self.workdir = os.getcwd()

    def load(self, custom_accent=None, custom_omographs=None):
        """Load accent/omograph dictionaries from ``<cwd>/dictionaries``.

        Args:
            custom_accent: optional dict of word -> accented-word overrides.
            custom_omographs: optional dict of word -> variant-list overrides.
        """
        if custom_omographs is None:
            custom_omographs = {}
        if custom_accent is None:
            custom_accent = {}

        dict_dir = join_path(self.workdir, "dictionaries")

        # Context managers close the JSON files promptly (the original left
        # the file objects from `json.load(open(...))` unclosed).
        with open(join_path(dict_dir, "omographs.json"), encoding="utf-8") as fh:
            self.omographs = json.load(fh)
        self.omographs.update(custom_omographs)

        with open(join_path(dict_dir, "accents.json"), encoding="utf-8") as fh:
            self.accents = json.load(fh)
        self.accents.update(custom_accent)

    def split_by_words(self, string):
        """Lowercase *string* and split it into word tokens (keeping inner
        ``+`` stress marks) and punctuation tokens."""
        tokens = self._TOKEN_RE.findall(string.lower())
        return [token for token in tokens if token]

    def process_all(self, text):
        """Core pipeline: sentence-split, tokenize, and add stress marks.

        Args:
            text: input string.

        Returns:
            A 3-tuple of lists:
            accented_sentence: accented sentence strings;
            omographs_list: ``"word: variants"`` strings for homographs found;
            unknown_list: multi-vowel words missing from the dictionary.
        """
        accented_sentence = []
        omographs_list = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            founded_omographs = self._process_omographs(words)
            omographs_list.extend(founded_omographs)

            processed_words, unknown_words = self._process_accent(words, founded_omographs)
            unknown_list.extend(unknown_words)

            # Re-join tokens and undo the space inserted before punctuation.
            accented_sentence.append(
                self.delete_spaces_before_punc(" ".join(processed_words))
            )

        # Flatten [{word: variants}, ...] into "word: variants" strings.
        omographs_list = [
            f"{key}: {value}"
            for elem in omographs_list
            for key, value in elem.items()
        ]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        """Restore "ё" spellings from a yo-words dictionary.

        NOTE(review): ``self.yo_words`` is never assigned anywhere in this
        file (the load() line is commented out), so calling this raises
        AttributeError; the method is currently unused by process_all().
        """
        for i, word in enumerate(text):
            text[i] = self.yo_words.get(word, word)
        return text

    def _process_omographs(self, text):
        """Return ``[{word: variants}, ...]`` for every token of *text*
        present in the omograph dictionary (order and duplicates kept)."""
        founded_omographs = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                founded_omographs.append({word: variants})
        return founded_omographs

    def _process_accent(self, text, founded_omographs):
        """Substitute dictionary stress forms into *text* in place.

        Homographs are left unaccented (their variants are reported via
        _process_omographs); words with no entry and more than one vowel are
        collected as unknown.

        Returns:
            (accented token list, unknown word list).
        """
        # Hoist the homograph keys out of the loop — the original rebuilt a
        # list of keys for every single word (O(n*m)).
        omograph_words = {next(iter(entry)) for entry in founded_omographs}

        unknown_words = []
        for i, word in enumerate(text):
            stressed_word = self.accents.get(word, word)
            if stressed_word == word:
                # No dictionary entry: only multi-vowel words are ambiguous
                # enough to report (single-vowel words need no mark).
                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
                    unknown_words.append(word)
            elif word in omograph_words:
                # Homograph: keep the plain form; the user resolves it.
                text[i] = word
            else:
                text[i] = stressed_word

        return text, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the space that ``" ".join`` inserted before punctuation."""
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text
133
+
134
+
135
+ # # Example usage:
136
+ # ru_accent = RUAccent()
137
+ # ru_accent.load()
138
+ #
139
+ # text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига из-за этого сучонка"
140
+ # processed_text = ru_accent.process_all(text_to_process)
141
+ #
142
+ # print(processed_text)
text_split.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re
import logging  # NOTE(review): imported but never used in this module
from typing import Set, Tuple, List

# Coarse sentence chunker: a run of non-terminator characters plus its
# trailing terminators (. ? ! …) and any closing quotes.
SENTENCE_SPLITTER = re.compile(r'[^\.?!…]+[\.?!…]*["»“]*')

# Last word of a chunk when it ends with a dot (abbreviation candidate).
LAST_WORD_PATTERN = re.compile(r'(?:\b|\d)([a-zа-я]+)\.$', re.IGNORECASE)
# First word of the following chunk.
FIRST_WORD_PATTERN = re.compile(r'^\W*(\w+)')
# Single Latin letter + dot at the end, e.g. "p." — groups: (border, letter).
ENDS_WITH_ONE_LETTER_LAT_AND_DOT_PATTERN = re.compile(r'(\d|\W|\b)([a-zA-Z])\.$')
# Inner-dotted abbreviation like "т.п." at the end of the chunk.
HAS_DOT_INSIDE_PATTERN = re.compile(r'[\w]+\.[\w]+\.$', re.IGNORECASE)
# Single capital initial + dot, e.g. "А." — groups: (border, initial).
INITIALS_PATTERN = re.compile(r'(\W|\b)([A-ZА-Я]{1})\.$')
# 1-4 Russian consonants only — likely an abbreviation, not a real word.
ONLY_RUS_CONSONANTS_PATTERN = re.compile(r'^[бвгджзйклмнпрстфхцчшщ]{1,4}$', re.IGNORECASE)
# A real sentence boundary must be followed by whitespace.
STARTS_WITH_EMPTYNESS_PATTERN = re.compile(r'^\s+')
# Emotional/ellipsis ending, optionally followed by a closing quote/bracket.
ENDS_WITH_EMOTION_PATTERN = re.compile(r'[!?…]|\.{2,}\s?[)"«»,“]?$')
# Next chunk begins with a lower-case letter, optionally after a dash/quote.
# NOTE(review): the character class mixes several dash variants — confirm it
# compiles to the intended set of leading punctuation.
STARTS_WITH_LOWER_PATTERN = re.compile(r'^\s*[–-—-("«]?\s*[a-zа-я]')
# Next chunk begins with a digit.
STARTS_WITH_DIGIT_PATTERN = re.compile(r'^\s*\d')
# Roman or arabic numeration item such as "IV." or "12.".
NUMERATION_PATTERN = re.compile(r'^\W*[IVXMCL\d]+\.$')
# Two one-word shortenings at the end, e.g. "т. е." — groups: (s1, s2).
PAIRED_SHORTENING_IN_THE_END_PATTERN = re.compile(r'\b(\w+)\. (\w+)\.\W*$')

# Boundary verdicts returned by is_sentence_end().
JOIN = 0
MAYBE = 1
SPLIT = 2

# Abbreviations that never terminate a sentence.
JOINING_SHORTENINGS = {
    'mr', 'mrs', 'ms', 'dr', 'vs', 'англ', 'итал', 'греч', 'евр', 'араб', 'яп', 'слав', 'кит',
    'тел', 'св', 'ул', 'устар', 'им', 'г', 'см', 'д', 'стр', 'корп', 'пл', 'пер', 'сокр', 'рис'
}

# Abbreviations that may or may not terminate a sentence (context decides).
SHORTENINGS = {
    'co', 'corp', 'inc', 'авт', 'адм', 'барр', 'внутр', 'га', 'дифф', 'дол', 'долл', 'зав', 'зам', 'искл',
    'коп', 'корп', 'куб', 'лат', 'мин', 'о', 'обл', 'обр', 'прим', 'проц', 'р', 'ред', 'руб', 'рус', 'русск',
    'сан', 'сек', 'тыс', 'эт', 'яз', 'гос', 'мн', 'жен', 'муж', 'накл', 'повел', 'букв', 'шутл', 'ед'
}

# Two-part abbreviations, e.g. "т. е." ("that is").
PAIRED_SHORTENINGS = {('и', 'о'), ('т', 'е'), ('т', 'п'), ('у', 'е'), ('н', 'э')}
+
38
def split_sentences(text: str) -> List[str]:
    """Coarsely split *text* into stripped sentence-like chunks using
    SENTENCE_SPLITTER (no abbreviation handling; see split_by_sentences)."""
    chunks = SENTENCE_SPLITTER.findall(text)
    return [chunk.strip() for chunk in chunks]
40
+
41
+
42
def is_sentence_end(left: str, right: str,
                    shortenings: Set[str],
                    joining_shortenings: Set[str],
                    paired_shortenings: Set[Tuple[str, str]]) -> int:
    """Classify the boundary between *left* and *right*.

    Args:
        left: text of the sentence candidate accumulated so far.
        right: remainder of the text after the candidate.
        shortenings: abbreviations that may or may not end a sentence.
        joining_shortenings: abbreviations that never end a sentence.
        paired_shortenings: two-part abbreviations such as ("т", "е").

    Returns:
        JOIN (not a boundary), MAYBE (ambiguous), or SPLIT (real boundary).
    """
    # A true boundary must be followed by whitespace.
    if not STARTS_WITH_EMPTYNESS_PATTERN.match(right):
        return JOIN

    # Inner-dotted endings like "т.п." are abbreviations, not boundaries.
    if HAS_DOT_INSIDE_PATTERN.search(left):
        return JOIN

    last_match = LAST_WORD_PATTERN.search(left)
    last_word = last_match.group(1) if last_match else ' '

    if last_word.lower() in joining_shortenings:
        return JOIN

    # A short run of Russian consonants is probably an abbreviation.
    if ONLY_RUS_CONSONANTS_PATTERN.search(last_word) and last_word[-1].islower():
        return MAYBE

    paired = PAIRED_SHORTENING_IN_THE_END_PATTERN.search(left)
    if paired and paired.groups() in paired_shortenings:
        return MAYBE

    first_match = FIRST_WORD_PATTERN.match(right)
    if first_match and (last_word, first_match.group(1)) in paired_shortenings:
        return MAYBE

    # "!"/"?"/ellipsis followed by a lower-case continuation stays joined.
    if ENDS_WITH_EMOTION_PATTERN.search(left) and STARTS_WITH_LOWER_PATTERN.match(right):
        return JOIN

    initials = INITIALS_PATTERN.search(left)
    if initials and (initials.group(1) or ' ') not in "°'":
        return JOIN

    if last_word.lower() in shortenings:
        return MAYBE

    one_letter = ENDS_WITH_ONE_LETTER_LAT_AND_DOT_PATTERN.search(left)
    if one_letter and (one_letter.group(1) or ' ') not in "°'":
        return MAYBE

    # Roman/arabic numeration like "IV." keeps the heading attached.
    if NUMERATION_PATTERN.match(left):
        return JOIN

    return SPLIT
95
+
96
+
97
def split_by_sentences(text: str,
                       shortenings: Set[str] = SHORTENINGS,
                       joining_shortenings: Set[str] = JOINING_SHORTENINGS,
                       paired_shortenings: Set[Tuple[str, str]] = PAIRED_SHORTENINGS) -> List[str]:
    """Split *text* into sentences, merging chunks whose boundary
    is_sentence_end() classifies as an abbreviation rather than an end.

    Args:
        text: input text.
        shortenings: abbreviations that may or may not end a sentence.
        joining_shortenings: abbreviations that never end a sentence.
        paired_shortenings: two-part abbreviations such as ("т", "е").

    Returns:
        List of non-empty, stripped sentence strings.
    """
    sentences = []
    sents = split_sentences(text)
    si = 0
    processed_index = 0
    sent_start = 0
    while si < len(sents):
        s = sents[si]
        # Locate this chunk inside the original text so spans stay exact
        # (the chunks were stripped, so offsets must be recomputed).
        span_start = text[processed_index:].index(s) + processed_index
        span_end = span_start + len(s)
        processed_index += len(s)

        si += 1

        send = is_sentence_end(text[sent_start: span_end], text[span_end:],
                               shortenings, joining_shortenings, paired_shortenings)
        if send == JOIN:
            continue

        if send == MAYBE:
            # Ambiguous abbreviation followed by a lower-case word or a
            # digit is treated as sentence-internal.
            if STARTS_WITH_LOWER_PATTERN.match(text[span_end:]):
                continue
            if STARTS_WITH_DIGIT_PATTERN.match(text[span_end:]):
                continue

        sentence = text[sent_start: span_end].strip()
        if sentence:
            sentences.append(sentence)
        else:
            # The original had a leftover debug `print(text)` here and then
            # appended the empty string anyway; log instead and skip the
            # empty sentence (the tail handler below already guards empties).
            logging.getLogger(__name__).debug(
                "empty sentence span while splitting: %r", text)
        sent_start = span_end
        processed_index = span_end

    if sent_start != len(text):
        if text[sent_start:].strip():
            sentences.append(text[sent_start:].strip())
    return sentences
web_interface.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from flask import Flask, render_template, request, send_file
from flask import abort

from ruaccent import RUAccent
import text_split
5
+
6
app = Flask(__name__)

# Load the accent/omograph dictionaries once at import time so every request
# reuses the same in-memory dictionaries (load() reads two JSON files from
# the "dictionaries" directory under the current working directory).
ru_accent = RUAccent()
ru_accent.load()
10
+
11
@app.route('/')
def index():
    # Render the text-input form (templates/index.html).
    return render_template('index.html')
14
+
15
@app.route('/process', methods=['POST'])
def process():
    """Accent the submitted text and write the three result files
    (accented text, homographs, unknown words) for later download.

    Returns the rendered result page.
    """
    # The route already restricts to POST, so the original
    # `if request.method == 'POST'` check (whose else-branch implicitly
    # returned None, i.e. a 500) is unnecessary.
    input_text = request.form['input_text']
    accented, omographs, unknown = ru_accent.process_all(input_text)

    # File name -> content for each downloadable report (same names that
    # the /download route serves).
    outputs = {
        'accented_text.txt': " ".join(accented),
        'omographs.txt': "\n".join(omographs),
        'unknown.txt': "\n".join(unknown),
    }
    # NOTE(review): files are written to the process CWD and are shared
    # across requests/workers, so concurrent requests overwrite each other.
    for file_name, content in outputs.items():
        with open(file_name, 'w', encoding="utf-8") as file:
            file.write(content)

    return render_template('result.html')
37
+
38
@app.route('/download/<file_name>')
def download(file_name):
    """Serve one of the generated result files as an attachment.

    Only the three report files produced by /process may be fetched; any
    other name is rejected with 404. The original passed the user-supplied
    name straight to send_file, which could expose arbitrary files from the
    working directory.
    """
    if file_name not in {'accented_text.txt', 'omographs.txt', 'unknown.txt'}:
        abort(404)
    return send_file(file_name, as_attachment=True, download_name=file_name)
42
+
43
if __name__ == '__main__':
    # Production serving is done by gunicorn (see Dockerfile); keep the dev
    # server line for reference. The original left this `if` with only a
    # comment in its body, which is a SyntaxError that broke the module.
    # app.run(debug=True, port=5001)
    pass