shakhovak commited on
Commit
5af977b
1 Parent(s): b54d798
Files changed (4) hide show
  1. app.py +112 -31
  2. dictionaries/accents.json +0 -3
  3. dictionaries/omographs.json +0 -0
  4. ruaccent.py +0 -142
app.py CHANGED
@@ -1,44 +1,125 @@
1
from flask import Flask, render_template, request, send_file
import os

from ruaccent import RUAccent
import text_split

app = Flask(__name__)

# Load the accent dictionaries once at startup (load() reads JSON from disk).
ru_accent = RUAccent()
ru_accent.load()

# Only these generated files may ever be served back to the client.
# Whitelisting closes the path-traversal hole in /download/<file_name>
# (e.g. /download/..%2Fapp.py).
ALLOWED_DOWNLOADS = ("accented_text.txt", "omographs.txt", "unknown.txt")


def _write_joined(file_name, parts, separator):
    """Write *parts* joined by *separator* to *file_name* as UTF-8."""
    with open(file_name, "w", encoding="utf-8") as fh:
        fh.write(separator.join(parts))


@app.route('/')
def index():
    """Serve the text-input form."""
    return render_template('index.html')


@app.route('/process', methods=['POST'])
def process():
    """Accent the submitted text and persist the three result files.

    The route only accepts POST, so the old `request.method` check was dead.
    """
    input_text = request.form['input_text']
    accented, omographs, unknown = ru_accent.process_all(input_text)

    _write_joined('accented_text.txt', accented, " ")
    _write_joined('omographs.txt', omographs, "\n")
    _write_joined('unknown.txt', unknown, "\n")

    return render_template('result.html')


@app.route('/download/<file_name>')
def download(file_name):
    """Serve one of the generated result files.

    Rejects any name outside ALLOWED_DOWNLOADS so a crafted URL cannot
    read arbitrary files from the working directory.
    """
    if file_name not in ALLOWED_DOWNLOADS:
        return "File not found", 404
    return send_file(file_name, as_attachment=True, download_name=file_name)


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
  import os
3
+ import re
4
+ from os.path import join as join_path
5
 
6
+ from text_split import split_by_sentences
7
 
 
 
8
 
9
class RUAccent:
    """Dictionary-based stress ('+') placer for Russian text.

    Call load() once to read the JSON dictionaries, then process_all()
    per text. Omographs (words with several valid stress positions) are
    reported but deliberately left unaccented.
    """

    # Russian vowel letters; used to decide whether an undictionaried
    # word is worth reporting as unknown (single-vowel words are trivial).
    vowels = "аеёиоуыэюя"

    def __init__(self):
        # Populated by load(); omographs maps word -> {"acc_variants": [...]},
        # accents maps word -> {"accent": accented_form} — TODO confirm the
        # exact JSON schema against dictionaries/file_omo.json / file_norm.json.
        self.omographs = None
        self.accents = None
        # yo_words is only used by _process_yo; keep it an empty dict so the
        # method is a safe no-op instead of raising AttributeError.
        self.yo_words = {}
        self.workdir = os.getcwd()

    def load(self, custom_accent=None, custom_omographs=None):
        """Read the dictionaries from disk, then fold in user overrides.

        Args:
            custom_accent: optional {word: {"accent": ...}} overrides.
            custom_omographs: optional {word: {"acc_variants": [...]}} overrides.
        """
        if custom_omographs is None:
            custom_omographs = {}
        if custom_accent is None:
            custom_accent = {}

        # Use context managers so the file handles are closed promptly.
        omo_path = join_path(self.workdir, "dictionaries", "file_omo.json")
        with open(omo_path, encoding='utf-8') as fh:
            self.omographs = json.load(fh)
        self.omographs.update(custom_omographs)

        acc_path = join_path(self.workdir, "dictionaries", "file_norm.json")
        with open(acc_path, encoding='utf-8') as fh:
            self.accents = json.load(fh)
        self.accents.update(custom_accent)

    def split_by_words(self, string):
        """Lower-case *string* and split it into words (keeping '+' marks)
        and punctuation runs; empty matches are dropped."""
        result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
        return [res for res in result if res]

    def process_all(self, text):
        """Ядро всей программы: accent every sentence of *text*.

        Input:
            text: string

        Output:
            accented_sentence: list[string] — sentences with '+' accents
            omographs_list: list[string] — unique "word: variants" entries
            unknown_list: list[string] — unique words absent from dictionaries
        """
        accented_sentence = []
        omographs_list = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            founded_omographs = self._process_omographs(words)
            omographs_list.extend(founded_omographs)

            processed_text, unknown_words = self._process_accent(words, founded_omographs)
            unknown_list.extend(unknown_words)

            joined = self.delete_spaces_before_punc(" ".join(processed_text))
            accented_sentence.append(joined)

        omographs_list = [f"{key}: {value}" for elem in omographs_list for key, value in elem.items()]
        return accented_sentence, list(set(omographs_list)), list(set(unknown_list))

    def _process_yo(self, text):
        """Restore 'ё' spellings in-place from self.yo_words (currently empty)."""
        for i, word in enumerate(text):
            text[i] = self.yo_words.get(word, word)
        return text

    def _process_omographs(self, text):
        """Return [{word: acc_variants}] for each ambiguous word in *text*."""
        founded_omographs = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                # Reuse the looked-up entry instead of a second dict access.
                founded_omographs.append({word: variants["acc_variants"]})
        return founded_omographs

    def _process_accent(self, text, founded_omographs):
        """Accent each word of *text* in-place.

        Returns (words, unknown_words). Omographs are left untouched;
        multi-vowel words missing from the dictionary are reported unknown.
        """
        unknown_words = []
        # Hoist the omograph keys out of the loop (each dict holds one word).
        # BUG FIX: the original compared `stressed_word` (a dict for known
        # words) against these string keys, so omographs present in the
        # accents dictionary were force-accented instead of skipped.
        omograph_words = {list(d.keys())[0] for d in founded_omographs}
        for i, word in enumerate(text):
            if word in omograph_words:
                # Ambiguous word: leave unaccented rather than guess.
                text[i] = word
                continue

            entry = self.accents.get(word)
            if entry is not None:
                text[i] = entry["accent"]
            else:
                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
                    unknown_words.append(word)
                text[i] = word

        return text, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the space that " ".join() left before punctuation marks."""
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text


# # Example usage:
# ru_accent = RUAccent()
# ru_accent.load()
#
# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига нига нига из-за этого сучонка"
# processed_text = ru_accent.process_all(text_to_process)
#
# print(processed_text)
dictionaries/accents.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:adb807918505efc4f2707e6536f52951e2be3bc3f714a7285fecdc7434c7f7b8
3
- size 178733505
 
 
 
 
dictionaries/omographs.json DELETED
The diff for this file is too large to render. See raw diff
 
ruaccent.py DELETED
@@ -1,142 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from os.path import join as join_path
5
-
6
- from text_split import split_by_sentences
7
-
8
-
9
class RUAccent:
    """Places stress marks ('+') in Russian text using lookup dictionaries.

    load() must be called before process_all(). Words with several valid
    stress positions (omographs) are reported separately and left as-is.
    """

    # Russian vowel letters, used to filter trivial one-vowel words
    # out of the "unknown" report.
    vowels = "аеёиоуыэюя"

    def __init__(self):
        # Both dictionaries are populated by load().
        self.omographs = None
        self.accents = None
        self.workdir = os.getcwd()

    def load(self, custom_accent=None, custom_omographs=None):
        """Load the JSON dictionaries and fold in optional user overrides."""
        extra_omographs = {} if custom_omographs is None else custom_omographs
        extra_accents = {} if custom_accent is None else custom_accent

        omographs_file = join_path(self.workdir, "dictionaries", "omographs.json")
        self.omographs = json.load(open(omographs_file, encoding='utf-8'))
        self.omographs.update(extra_omographs)

        accents_file = join_path(self.workdir, "dictionaries", "accents.json")
        self.accents = json.load(open(accents_file, encoding='utf-8'))
        self.accents.update(extra_accents)

    def split_by_words(self, string):
        """Lower-case *string* and split it into words (keeping '+' marks)
        and punctuation runs, discarding empty matches."""
        tokens = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
        return [token for token in tokens if token]

    def process_all(self, text):
        """Ядро всей программы: accent every sentence of *text*.

        Input:
            text: string

        Output:
            accented_sentence: list[string]
            omographs_list: list[string]
            unknown_list: list[string]
        """
        accented_sentence = []
        omographs_list = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            words = self.split_by_words(sentence)

            hits = self._process_omographs(words)
            omographs_list.extend(hits)

            accented_words, missing = self._process_accent(words, hits)
            unknown_list.extend(missing)

            joined = self.delete_spaces_before_punc(" ".join(accented_words))
            accented_sentence.append(joined)

        omographs_list = [
            f"{key}: {value}"
            for entry in omographs_list
            for key, value in entry.items()
        ]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        """Restore 'ё' spellings in-place (needs self.yo_words to be set)."""
        for i, word in enumerate(text):
            text[i] = self.yo_words.get(word, word)
        return text

    def _process_omographs(self, text):
        """Collect {word: variants} for every ambiguous word in *text*."""
        found = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                found.append({word: variants})
        return found

    def _process_accent(self, text, founded_omographs):
        """Accent each word of *text* in-place; also return unknown words."""
        unknown_words = []
        for i, word in enumerate(text):
            stressed_word = self.accents.get(word, word)
            if stressed_word == word:
                # Not in the dictionary: report multi-vowel words as unknown.
                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
                    unknown_words.append(word)
                text[i] = word
            elif word in [list(d.keys())[0] for d in founded_omographs]:
                # Ambiguous word — keep it unaccented for later review.
                text[i] = word
            else:
                text[i] = stressed_word
        return text, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the space that " ".join() left before punctuation marks."""
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text


# # Example usage:
# ru_accent = RUAccent()
# ru_accent.load()
#
# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига из-за этого сучонка"
# processed_text = ru_accent.process_all(text_to_process)
#
# print(processed_text)
- # print(processed_text)