SoooSlooow committed on
Commit
d1ef404
·
1 Parent(s): a44a953

upload src

Browse files
app.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+
4
+ from src.data.preprocessing_utils import DataPreprocessor
5
+
6
# Location of the serialized classifier, relative to the working directory.
MODEL_FILEPATH = 'models/nnet/nnet.pt'

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# torch.load unpickles whatever object was stored in the file and
# map_location remaps its tensors onto the selected device.
# NOTE(review): presumably the file holds a whole pickled classifier object
# (predict_label calls clf.predict_labels on it) rather than a state dict,
# so the defining module must be importable at load time - confirm.
with open(MODEL_FILEPATH, mode='rb') as file:
    clf = torch.load(file, map_location=device)

# Tokenizer / lemmatizer applied to every submitted text before prediction.
preprocessor = DataPreprocessor()

# Label key -> Russian phrase shown to the user when that label is detected.
strings = dict(
    nationality='Есть предпочтения по национальности',
    families='Есть предпочтение семьям',
    sex='Есть предпочтения по полу',
)

# Sample rental ads offered as ready-made inputs in the demo interface.
examples = [
    '''Просьба посредников не беспокоить. Ищем ОДНУ ДЕВУШКУ.
Сдаётся в аренду на длительный срок светлая и уютная квартира - студия
общей площадью 33м2, находящаяся на 4м этаже 5и этажного теплого кирпичного дома. Современный ремонт!
Рядом в пешей доступности парк Красная Пресня (5 мин)/ Красногвардейские Пруды (2 мин)/
Москва-Сити (10 мин)! Магазины/кофейни/рестораны! 10 мин на машине до любой точки в центре города!
В квартире есть вся необходимая для проживания мебель и техника.
Строго без животных, строго Славян. Просмотр в любое время - ключи на руках.
''',
    '''Сдам на длительный срок семейной паре, только с гражданством РФ.
Квартира после косметического ремонта. Без мебели.
Есть кухонная мебель и мебель в ванной комнате.
Бытовая техника для проживания присутствует.
Оплата = аренда + счётчики (свет, вода).
''',
    '''В современном доме. Собственник без комиссии.
Закрытая территория. Доступ через охрану.
М Прокшино 10 мин пешком.
Без детей и животных.
Возможно без залога.
Счетчики и интернет включены в стоимость
'''
]
42
+
43
+
44
def make_output_string(labels):
    """Turn per-label predictions into one user-facing sentence.

    ``labels`` is indexed with every key of the module-level ``strings``
    mapping; each truthy entry contributes that key's phrase. The phrases
    are joined with ``', '`` and passed through ``str.capitalize`` (first
    character upper-cased, the rest lower-cased). When nothing is flagged
    the fixed "no special features" message is returned instead.
    """
    detected = [phrase for key, phrase in strings.items() if labels[key]]
    if not detected:
        return 'Нет особенностей'
    return ', '.join(detected).capitalize()
54
+
55
+
56
def predict_label(text):
    """Classify one ad text and return the sentence shown in the UI.

    The text is run through the module-level ``preprocessor``; the resulting
    token lists are printed to stdout. If preprocessing leaves no tokens at
    all, a prompt asking the user to enter an ad text is returned. Otherwise
    the tokens go to ``clf.predict_labels`` and the predictions are rendered
    with ``make_output_string``.
    """
    token_rows = preprocessor.preprocess_texts([text])
    print(token_rows)
    # A single empty row means nothing survived tokenization and filtering.
    if token_rows == [[]]:
        return 'Введите текст объявления!'
    return make_output_string(clf.predict_labels(token_rows))
64
+
65
+
66
# Gradio UI: one multi-line text input, one text output, wired to
# predict_label and pre-populated with the sample ads.
_ad_text_input = gr.Text(label="Текст объявления", lines=5)
_ad_features_output = gr.Textbox(label="Особенности объявления")
demo = gr.Interface(
    fn=predict_label,
    inputs=[_ad_text_input],
    outputs=[_ad_features_output],
    examples=examples,
)

demo.launch()
models/nnet/nnet.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eca046bc6417544613037ccbd7c55537dbfa0d44d181480b0c5aedc32b775877
3
+ size 3474673281
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (156 Bytes). View file
 
src/data/__init__.py ADDED
File without changes
src/data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (161 Bytes). View file
 
src/data/__pycache__/preprocessing_utils.cpython-310.pyc ADDED
Binary file (1.92 kB). View file
 
src/data/preprocessing_utils.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import string
2
+ import nltk
3
+ from nltk.corpus import stopwords
4
+ from nltk.tokenize import WordPunctTokenizer
5
+ import pymorphy2
6
+
7
+
8
class DataPreprocessor:
    """Lower-case, tokenize, filter and lemmatize raw texts.

    Filtering removes NLTK's Russian stop words (except 'и', 'или', 'не',
    which are kept) and every token that is exactly one character of
    ``string.punctuation``. Remaining tokens are replaced by the normal form
    of their first pymorphy2 parse.
    """

    def __init__(self):
        # Fetches the NLTK stop-word corpus (nltk.download is called on
        # every construction).
        nltk.download('stopwords')
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = WordPunctTokenizer()
        self.punctuation = set(string.punctuation)
        self.stopwords_russian = stopwords.words("russian")
        # These three stop words are deliberately NOT filtered out.
        kept_words = {'и', 'или', 'не'}
        self.stop_tokens = (set(self.stopwords_russian) - kept_words) | self.punctuation

    def tokenize_data(self, texts):
        """Return one token list per text; each text is str()-ed and lower-cased first."""
        rows = []
        for raw in texts:
            rows.append(self.tokenizer.tokenize(str(raw).lower()))
        return rows

    def lemmatize_tokens_string(self, tokens_string):
        """Return the lemmas of the tokens that are not stop tokens, in order."""
        return [
            self.morph.parse(word)[0].normal_form
            for word in tokens_string
            if word not in self.stop_tokens
        ]

    def lemmatize_tokens(self, tokens):
        """Lemmatize every row of ``tokens`` in place; returns None."""
        for position, row in enumerate(tokens):
            tokens[position] = self.lemmatize_tokens_string(row)

    def preprocess_texts(self, texts):
        """Tokenize ``texts`` and return the filtered, lemmatized token lists."""
        rows = self.tokenize_data(texts)
        self.lemmatize_tokens(rows)
        return rows
src/models/__init__.py ADDED
File without changes
src/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (163 Bytes). View file
 
src/models/__pycache__/models_utils.cpython-310.pyc ADDED
Binary file (18.4 kB). View file
 
src/models/models_utils.py ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import numpy as np
4
+ import pandas as pd
5
+ from gensim.models import KeyedVectors
6
+ from collections import Counter
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from sklearn.metrics import roc_auc_score, precision_recall_curve
11
+ import tqdm
12
+ from copy import deepcopy
13
+ import matplotlib.pyplot as plt
14
+ from transformers import DistilBertTokenizer, DistilBertModel
15
+
16
+
17
def get_roc_aucs(y, probas):
    """Compute ROC AUC scores for binary or multi-class predictions.

    Args:
        y: true class labels (anything ``pd.get_dummies`` accepts).
        probas: 2-D array of class probabilities, indexed as
            ``probas[:, class_position]``.

    Returns:
        A list of scores. With more than two classes: one one-vs-rest AUC per
        class followed by the macro-averaged one-vs-one AUC. Otherwise a
        single-element list with the AUC computed from column 1 of ``probas``.
    """
    y_onehot = pd.get_dummies(y)
    n_classes = y_onehot.shape[1]
    if n_classes <= 2:
        return [roc_auc_score(y, probas[:, 1])]

    # Select one-hot columns by position: the column *names* are the class
    # labels themselves, which are not guaranteed to be 0..n-1.
    roc_auc_scores = [
        roc_auc_score(y_onehot.iloc[:, i], probas[:, i])
        for i in range(n_classes)
    ]
    roc_auc_scores.append(roc_auc_score(y, probas, multi_class='ovo', average='macro'))
    return roc_auc_scores
27
+
28
+
29
def get_max_f1_score(y, probas):
    """Find the best F1 score over all decision thresholds (binary only).

    Args:
        y: true binary labels.
        probas: 2-D array with exactly two columns; column 1 is used as the
            positive-class score.

    Returns:
        ``(f1_score, threshold)`` - two single-element lists holding the
        maximal (smoothed) F1 value and the threshold at which it occurs.

    Raises:
        ValueError: if ``probas`` does not have exactly two columns.
    """
    if probas.shape[1] != 2:
        raise ValueError('Expected probabilities for 2 classes would be given')
    p, r, t = precision_recall_curve(y, probas[:, 1])
    # precision/recall carry one trailing point (precision 1, recall 0) that
    # has no threshold. Its F1 is 0, so dropping it cannot change the maximum
    # and keeps every index valid for ``t``.
    p, r = p[:-1], r[:-1]
    # The 0.001 term avoids division by zero when precision + recall == 0.
    f1_scores = 2 * p * r / (p + r + 0.001)
    best = np.argmax(f1_scores)
    return [f1_scores[best]], [t[best]]
40
+
41
+
42
class RNN(nn.Module):
    """GRU classifier on top of a frozen embedding table.

    Args:
        vectors: 2-D array of embedding weights, one row per vocabulary
            entry; its second dimension sets both the embedding size and the
            GRU hidden size.
        n_of_words: number of rows of the embedding table.
        n_of_classes: size of the output layer.
        num_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
    """

    def __init__(self, vectors, n_of_words, n_of_classes, num_layers, bidirectional):
        super().__init__()
        dim = vectors.shape[1]
        directions = 2 if bidirectional else 1
        self.emb = nn.Embedding(n_of_words, dim)
        # Copy the supplied vectors into the table and freeze them.
        self.emb.load_state_dict({'weight': torch.as_tensor(vectors)})
        self.emb.weight.requires_grad = False
        self.gru = nn.GRU(input_size=dim, hidden_size=dim, batch_first=True,
                          num_layers=num_layers, bidirectional=bidirectional)
        # The classifier sees the final hidden state of every layer/direction.
        self.linear = nn.Linear(dim * num_layers * directions, n_of_classes)

    def forward(self, batch):
        """Map a ``(batch, seq_len)`` tensor of token indices to ``(batch, n_of_classes)`` scores."""
        emb = self.emb(batch)
        _, last_state = self.gru(emb)
        # last_state is (layers * directions, batch, dim). Put the batch axis
        # first and flatten the rest per sample. An explicit target shape is
        # used instead of squeeze(), which would also drop a batch or feature
        # axis that happens to have size 1.
        features = last_state.permute(1, 0, 2).reshape(batch.shape[0], -1)
        return self.linear(features)
63
+
64
+
65
class DistilBERTClass(torch.nn.Module):
    """Pretrained DistilBERT encoder followed by a linear classification layer."""

    def __init__(self, n_classes):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained('DeepPavlov/distilrubert-small-cased-conversational')
        # 768 is the input width of the head; it has to equal the encoder's
        # hidden size.
        self.linear = torch.nn.Linear(768, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(batch, n_classes)`` scores.

        ``token_type_ids`` is accepted but not passed to the encoder.
        """
        encoded = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output is the sequence of hidden
        # states; the vector at position 0 of each sequence feeds the head.
        first_token_state = encoded[0][:, 0]
        return self.linear(first_token_state)
77
+
78
+
79
class BaseClassifier:
    """Shared training and inference plumbing for token-sequence classifiers.

    Subclasses must provide ``initialize_nnet()``, which sets ``self.nnet``
    and ``self.optimizer``; ``fit`` calls it when ``reinitialize`` is true.
    A vocabulary (``word_to_idx``, ``pad_idx``, ``unk_idx``, ``vectors``)
    must be prepared with one of the ``preprocess*`` methods before batches
    can be generated.
    """

    def __init__(self, batch_size=16, epochs=100):
        self.batch_size = batch_size
        self.epochs = epochs
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def preprocess_with_random_initialization(self, train_tokens):
        """Build the vocabulary from ``train_tokens`` with random 300-d vectors."""
        self.pad_idx = 0
        self.unk_idx = 1

        set_of_words = set()
        for tokens_string in train_tokens:
            set_of_words.update(tokens_string)

        self.idx_to_word = ['PADDING', 'UNK'] + list(set_of_words)
        self.word_to_idx = {key: i for i, key in enumerate(self.idx_to_word)}
        self.amount_of_words = len(self.idx_to_word)

        # Row 0 (padding) stays all-zero; every other row, UNK included, gets
        # a small uniform random vector.
        self.vectors = np.zeros((self.amount_of_words, 300))
        self.vectors[1:, :] = (np.random.rand(self.amount_of_words - 1, 300) - 0.5) / 300

    def preprocess(self, vectors_file_path):
        """Build the vocabulary and embedding matrix from a word2vec-format file."""
        self.emb = KeyedVectors.load_word2vec_format(vectors_file_path)

        self.pad_idx = 0
        self.unk_idx = 1

        self.idx_to_word = ['PADDING', 'UNK'] + list(self.emb.index_to_key)
        self.word_to_idx = {key: i for i, key in enumerate(self.idx_to_word)}
        self.amount_of_words = len(self.idx_to_word)

        # Row 0 (padding) is zero, row 1 (UNK) is a small random vector and
        # the remaining rows are copied from the loaded embeddings.
        self.vectors = np.zeros((self.amount_of_words, 300))
        self.vectors[1, :] = (np.random.rand(300) - 0.5) / 300
        for i in range(2, self.amount_of_words):
            self.vectors[i, :] = self.emb.get_vector(self.idx_to_word[i])

    def _forward(self, batch):
        """Run the network on one batch produced by ``batch_generator``."""
        return self.nnet(batch['tokens'])

    def _evaluate(self, criterion, test_tokens, y_test):
        """Score the test set and append loss, ROC AUC and best F1 to the history."""
        self.nnet.eval()
        with torch.no_grad():
            test_logits = torch.cat(
                [self._forward(batch).to('cpu')
                 for batch in self.batch_generator(test_tokens, y_test)], 0)
            targets = torch.tensor(y_test.values, dtype=torch.long, device='cpu')
            test_score = criterion(test_logits, targets)
        self.test_scores.append(test_score.item())
        # Normalise over the class axis explicitly.
        test_pred_probas = F.softmax(test_logits, dim=1).numpy()
        self.test_aucs.append(get_roc_aucs(y_test, test_pred_probas))
        self.test_f1.append(get_max_f1_score(y_test, test_pred_probas)[0])

    def fit(self, train_tokens, y_train, test_tokens=None, y_test=None,
            reinitialize=True, stop_epochs=None, show_logs=False):
        """Train the network, optionally evaluating on a test set every epoch.

        Args:
            train_tokens: sequence of token lists.
            y_train: pandas Series of integer class labels.
            test_tokens, y_test: optional evaluation data in the same form.
            reinitialize: when true, count the classes in ``y_train`` and
                call ``initialize_nnet()`` before training.
            stop_epochs: patience for early stopping on the test loss;
                ``None`` disables it.
            show_logs: print the sampled training loss while training.
        """
        if reinitialize:
            self.n_of_classes = y_train.nunique()
            self.initialize_nnet()

        # Explicit None checks: y_test is a pandas Series, whose truth value
        # is ambiguous and raises ValueError.
        self.print_test = (test_tokens is not None and y_test is not None
                           and len(test_tokens) > 0)
        self.stop_epochs = stop_epochs
        self.train_scores_mean = []
        self.test_scores = []
        self.test_aucs = []
        self.test_f1 = []
        criterion = nn.CrossEntropyLoss()
        # Ceiling division so the progress bar also counts a short last batch.
        n_batches = -(-len(train_tokens) // self.batch_size)
        for epoch in tqdm.tqdm(range(self.epochs)):
            self.epoch = epoch
            self.nnet.train()
            train_scores = []
            train_batches = self.batch_generator(train_tokens, y_train)
            for i, batch in tqdm.tqdm(enumerate(train_batches), total=n_batches):
                loss = criterion(self._forward(batch), batch['labels'])
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                # Sample the post-step loss every 400 batches. It is recorded
                # even without show_logs, because print_metrics plots one
                # training-loss point per epoch.
                if i % 400 == 0:
                    with torch.no_grad():
                        train_score = criterion(self._forward(batch), batch['labels']).item()
                    if show_logs:
                        print(train_score)
                    train_scores.append(train_score)
            self.train_scores_mean.append(
                sum(train_scores) / len(train_scores) if train_scores else float('nan'))
            if self.print_test:
                self._evaluate(criterion, test_tokens, y_test)
            self.print_metrics()
            if self.early_stopping_check():
                break

    def count_tokens(self, tokens):
        """Store per-token frequencies and the total token count on the instance."""
        self.words_counter = Counter()
        self.amount_of_tokens = 0
        for s in tokens:
            self.words_counter.update(s)
            self.amount_of_tokens += len(s)

    def index_tokens(self, tokens_string):
        """Map tokens to vocabulary indices; unknown tokens map to ``unk_idx``."""
        return [self.word_to_idx.get(token, self.unk_idx) for token in tokens_string]

    def fill_with_pads(self, tokens):
        """Return copies of the rows, right-padded with ``pad_idx`` to equal length."""
        max_len = max((len(row) for row in tokens), default=0)
        return [list(row) + [self.pad_idx] * (max_len - len(row)) for row in tokens]

    def as_matrix(self, tokens):
        """Convert token lists to a rectangular list of index rows.

        A new list is built instead of assigning by position into a copy, so
        the input may be any iterable of token lists and is never modified.
        """
        return self.fill_with_pads([self.index_tokens(s) for s in tokens])

    def batch_generator(self, tokens, labels=None):
        """Yield ``{'tokens', 'labels'}`` batches of at most ``batch_size`` rows.

        ``labels`` may be a pandas Series, a NumPy array or a list; when it
        is ``None`` the ``'labels'`` entry of every batch is ``None``.
        """
        # ``if labels:`` would raise for a Series, so compare against None.
        label_values = None if labels is None else np.asarray(labels)
        for i in range(0, len(tokens), self.batch_size):
            batch_tokens = tokens[i: i + self.batch_size]
            if label_values is None:
                batch_labels = None
            else:
                batch_labels = torch.tensor(label_values[i: i + self.batch_size],
                                            dtype=torch.long,
                                            device=self.device)

            batch_tokens_idx = torch.tensor(self.as_matrix(batch_tokens),
                                            dtype=torch.int,
                                            device=self.device)
            if len(batch_tokens_idx.size()) == 1:
                batch_tokens_idx = torch.unsqueeze(batch_tokens_idx, 0)

            yield {
                'tokens': batch_tokens_idx,
                'labels': batch_labels
            }

    def print_metrics(self, print_test=True):
        """Print and plot the metric history collected by ``fit``.

        The ``print_test`` argument is accepted for backward compatibility
        and ignored; ``self.print_test`` decides which plots are drawn.
        """
        epochs_axis = np.arange(1, self.epoch + 2)

        if self.print_test:
            print(f'epoch {self.epoch}/{self.epochs}')
            print('auc', self.test_aucs[-1])
            print('score', self.test_scores[-1])
            print('f1 score', self.test_f1[-1])

            legend_labels = []
            if self.n_of_classes > 2:
                legend_labels.extend(f'Class {i}' for i in range(self.n_of_classes))
            legend_labels.append('General')

            plt.figure(figsize=(5, 15))

            plt.clf()

            plt.subplot(3, 1, 1)
            plt.plot(epochs_axis, self.test_aucs)
            plt.grid()
            plt.title('Test ROC AUC')
            plt.xlabel('Num. of epochs')
            plt.ylabel('ROC AUC')
            plt.legend(legend_labels)

            plt.subplot(3, 1, 2)
            plt.plot(epochs_axis, self.test_f1)
            plt.grid()
            plt.title('Test F1-score')
            plt.xlabel('Num. of epochs')
            plt.ylabel('F1-score')
            plt.legend(legend_labels)

            plt.subplot(3, 1, 3)
            plt.plot(epochs_axis, self.train_scores_mean, label='Train loss')
            plt.plot(epochs_axis, self.test_scores, label='Test loss')
            plt.title('Loss')
            plt.xlabel('Num. of epochs')
            plt.ylabel('Loss')
            plt.legend()
            plt.grid()
            plt.draw()

        else:
            plt.figure(figsize=(5, 15))
            plt.plot(epochs_axis, self.train_scores_mean, label='Train loss')
            plt.title('Loss')
            plt.xlabel('Num. of epochs')
            plt.ylabel('Loss')
            plt.legend()
            plt.grid()
            plt.show()

    def early_stopping_check(self):
        """Return True when the test loss has not improved for ``stop_epochs`` epochs.

        Each of the last ``stop_epochs`` test losses must be greater than or
        equal to the loss recorded just before them. Always False while
        early stopping is disabled or the history is still too short.
        """
        if self.stop_epochs is None or self.stop_epochs >= len(self.test_scores):
            return False
        print(self.test_scores)
        scores = np.array(self.test_scores)
        first_score = scores[-self.stop_epochs - 1]
        last_scores = scores[-self.stop_epochs:]
        return bool(np.all(last_scores >= first_score))

    def predict_proba(self, tokens, labels=None):
        """Return class probabilities for ``tokens`` as a NumPy array.

        ``labels`` is not needed for prediction; it is kept (now optional)
        for backward compatibility.
        """
        pred_probas = torch.tensor([], device=self.device)
        with torch.no_grad():
            self.nnet.eval()
            for batch in self.batch_generator(tokens, labels):
                pred_probas = torch.cat((pred_probas, self._forward(batch)))
        # dim=-1 is the class axis of the (n, classes) result and is also
        # valid for the empty 1-D tensor produced by empty input.
        return F.softmax(pred_probas, dim=-1).detach().cpu().numpy()
291
+
292
+
293
class RNNClassifier(BaseClassifier):
    """BaseClassifier whose network is the GRU-based ``RNN`` model."""

    def __init__(self, batch_size=16, epochs=100,
                 num_layers=1, bidirectional=False):
        # The base initialiser stores batch_size, epochs and the device.
        super().__init__(batch_size=batch_size, epochs=epochs)
        self.num_layers = num_layers
        self.bidirectional = bidirectional

    def _build_nnet(self):
        """Create an ``RNN`` from the current vocabulary and settings on ``self.device``."""
        network = RNN(self.vectors, self.amount_of_words,
                      n_of_classes=self.n_of_classes,
                      num_layers=self.num_layers,
                      bidirectional=self.bidirectional)
        return network.to(self.device)

    def initialize_nnet(self):
        """Create a fresh network and an Adam optimizer over its parameters."""
        self.nnet = self._build_nnet()
        self.optimizer = torch.optim.Adam(self.nnet.parameters())

    def save_model(self, filepath):
        """Write the network's state dict to ``filepath``."""
        state = self.nnet.state_dict()
        with open(filepath, 'wb') as file:
            torch.save(state, file)

    def load_model(self, filepath, amount_of_words):
        """Rebuild a two-class network and load its weights from ``filepath``.

        The embedding table is created as ``amount_of_words`` x 300 zeros and
        then overwritten by the loaded state dict.
        """
        self.amount_of_words = amount_of_words
        self.vectors = np.zeros((amount_of_words, 300))
        self.n_of_classes = 2
        self.nnet = self._build_nnet()
        weights = torch.load(filepath, map_location=self.device)
        self.nnet.load_state_dict(weights)
323
+
324
+
325
class DBERTClassifier(BaseClassifier):
    """BaseClassifier whose network is ``DistilBERTClass``.

    Token lists are joined back into space-separated strings and encoded
    with the DistilBERT tokenizer (padded/truncated to 512 positions).
    """

    # Checkpoint used for both the encoder's tokenizer and its weights.
    # ('DeepPavlov/rubert-base-cased' was noted as an alternative.)
    PRETRAINED_NAME = 'DeepPavlov/distilrubert-small-cased-conversational'

    def __init__(self, batch_size=16, epochs=100):
        # The base initialiser stores batch_size, epochs and the device.
        super().__init__(batch_size=batch_size, epochs=epochs)

    def _load_tokenizer(self):
        """Load the tokenizer that matches ``PRETRAINED_NAME``."""
        return DistilBertTokenizer.from_pretrained(self.PRETRAINED_NAME,
                                                   do_lower_case=True)

    def _forward(self, batch):
        """Run the network on one batch produced by ``batch_generator``."""
        return self.nnet(batch['tokens'], batch['mask'], batch['token_type_ids'])

    def initialize_nnet(self):
        """Create a fresh network, its Adam optimizer (lr=2e-6) and the tokenizer."""
        self.nnet = DistilBERTClass(self.n_of_classes).to(self.device)
        self.optimizer = torch.optim.Adam(self.nnet.parameters(), lr=2e-6)
        self.tokenizer = self._load_tokenizer()

    def batch_generator(self, tokens, labels=None):
        """Yield ``{'tokens', 'mask', 'token_type_ids', 'labels'}`` batches.

        ``labels`` may be a pandas Series, a NumPy array or a list; when it
        is ``None`` the ``'labels'`` entry of every batch is ``None``.
        """
        # ``if labels:`` would raise for a Series, so compare against None.
        label_values = None if labels is None else np.asarray(labels)
        for i in range(0, len(tokens), self.batch_size):
            batch_texts = [' '.join(s) for s in tokens[i: i + self.batch_size]]
            if label_values is None:
                batch_labels = None
            else:
                batch_labels = torch.tensor(label_values[i: i + self.batch_size],
                                            dtype=torch.long,
                                            device=self.device)
            # Always use the batch API, even for a single text. Passing a
            # list of strings to encode_plus makes the tokenizer treat each
            # string as one already-split token, so a lone text would be
            # encoded as a single (unknown) token. The batch API returns one
            # row per text, so no extra unsqueeze is needed.
            inputs = self.tokenizer.batch_encode_plus(
                batch_texts,
                add_special_tokens=True,
                max_length=512,
                truncation=True,
                padding='max_length',  # replaces deprecated pad_to_max_length=True
                return_token_type_ids=True
            )
            yield {
                'tokens': torch.tensor(inputs['input_ids'], device=self.device, dtype=torch.long),
                'mask': torch.tensor(inputs['attention_mask'], device=self.device, dtype=torch.long),
                'token_type_ids': torch.tensor(inputs['token_type_ids'], device=self.device,
                                               dtype=torch.long),
                'labels': batch_labels
            }

    def _evaluate(self, criterion, test_tokens, y_test):
        """Score the test set and append loss, ROC AUC and best F1 to the history."""
        self.nnet.eval()
        with torch.no_grad():
            test_logits = torch.cat(
                [self._forward(batch).to('cpu')
                 for batch in self.batch_generator(test_tokens, y_test)], 0)
            targets = torch.tensor(y_test.values, dtype=torch.long, device='cpu')
            test_score = criterion(test_logits, targets)
        self.test_scores.append(test_score.item())
        # Normalise over the class axis explicitly.
        test_pred_probas = F.softmax(test_logits, dim=1).numpy()
        self.test_aucs.append(get_roc_aucs(y_test, test_pred_probas))
        self.test_f1.append(get_max_f1_score(y_test, test_pred_probas)[0])

    def fit(self, train_tokens, y_train, test_tokens=None, y_test=None,
            reinitialize=True, stop_epochs=None, show_logs=False):
        """Train the network, optionally evaluating on a test set every epoch.

        Args:
            train_tokens: sequence of token lists.
            y_train: pandas Series of integer class labels.
            test_tokens, y_test: optional evaluation data in the same form.
            reinitialize: when true, count the classes in ``y_train`` and
                call ``initialize_nnet()`` before training.
            stop_epochs: patience for early stopping on the test loss;
                ``None`` disables it.
            show_logs: print the sampled training loss while training.
        """
        if reinitialize:
            self.n_of_classes = y_train.nunique()
            self.initialize_nnet()

        self.stop_epochs = stop_epochs
        # Explicit None checks: y_test is a pandas Series, whose truth value
        # is ambiguous and raises ValueError.
        self.print_test = (test_tokens is not None and y_test is not None
                           and len(test_tokens) > 0)
        self.train_scores_mean = []
        self.test_scores = []
        self.test_aucs = []
        self.test_f1 = []
        criterion = nn.CrossEntropyLoss()
        # Ceiling division so the progress bar also counts a short last batch.
        n_batches = -(-len(train_tokens) // self.batch_size)
        for epoch in tqdm.tqdm(range(self.epochs)):
            self.epoch = epoch
            self.nnet.train()
            train_scores = []
            train_batches = self.batch_generator(train_tokens, y_train)
            for i, batch in tqdm.tqdm(enumerate(train_batches), total=n_batches):
                loss = criterion(self._forward(batch), batch['labels'])
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                # Sample the post-step loss every 400 batches. It is recorded
                # even without show_logs, because print_metrics plots one
                # training-loss point per epoch.
                if i % 400 == 0:
                    with torch.no_grad():
                        train_score = criterion(self._forward(batch), batch['labels']).item()
                    if show_logs:
                        print(train_score)
                    train_scores.append(train_score)
            self.train_scores_mean.append(
                sum(train_scores) / len(train_scores) if train_scores else float('nan'))
            if self.print_test:
                self._evaluate(criterion, test_tokens, y_test)
            self.print_metrics()
            if self.early_stopping_check():
                break

    def predict_proba(self, tokens, labels=None):
        """Return class probabilities for ``tokens`` as a NumPy array.

        ``labels`` is not needed for prediction; it is kept (now optional)
        for backward compatibility.
        """
        pred_probas = torch.tensor([], device=self.device)
        with torch.no_grad():
            self.nnet.eval()
            for batch in self.batch_generator(tokens, labels):
                pred_probas = torch.cat((pred_probas, self._forward(batch)))
        # dim=-1 is the class axis of the (n, classes) result and is also
        # valid for the empty 1-D tensor produced by empty input.
        return F.softmax(pred_probas, dim=-1).detach().cpu().numpy()

    def predict(self, tokens, labels=None):
        """Return the position of the most probable class for every input row."""
        return np.argmax(self.predict_proba(tokens, labels), axis=1)

    def save_model(self, filepath):
        """Write the network's state dict to ``filepath``."""
        with open(filepath, 'wb') as file:
            torch.save(self.nnet.state_dict(), file)

    def load_model(self, filepath):
        """Rebuild a two-class network, optimizer and tokenizer, then load weights."""
        self.n_of_classes = 2
        self.nnet = DistilBERTClass(self.n_of_classes).to(self.device)
        self.optimizer = torch.optim.Adam(self.nnet.parameters(), lr=2e-6)
        self.tokenizer = self._load_tokenizer()
        self.nnet.load_state_dict(torch.load(filepath, map_location=self.device))
463
+
464
+
465
class AdClassifier:
    """Bundle of four binary classifiers applied to the same ad tokens.

    Three DistilBERT-based models ('nationality', 'families', 'sex') and one
    bidirectional GRU model ('limit') are loaded from ``weights_folder``;
    ``predict_labels`` thresholds the positive-class probability of each.
    """

    # Checkpoint whose tokenizer prepares the DistilBERT inputs.
    PRETRAINED_NAME = 'DeepPavlov/distilrubert-small-cased-conversational'

    # Minimum positive-class probability for a label to be reported.
    THRESHOLDS = {
        'nationality': 0.75,
        'families': 0.7,
        'sex': 0.25,
        'limit': 0.42
    }

    def __init__(self, weights_folder, dictionary_path):
        """Load the vocabulary, the tokenizer and all four models.

        Args:
            weights_folder: directory containing ``model_nationality.pt``,
                ``model_families.pt``, ``model_sex.pt`` and ``model_limit.pt``.
            dictionary_path: pickled token -> index mapping for the GRU model.
        """
        self.batch_size = 16

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.pad_idx = 0
        self.unk_idx = 1

        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only point dictionary_path at trusted, locally produced files.
        with open(dictionary_path, 'rb') as file:
            self.word_to_idx = pickle.load(file)

        self.tokenizer = DistilBertTokenizer.from_pretrained(
            self.PRETRAINED_NAME,
            do_lower_case=True
        )

        nationality_nn_path = os.path.join(weights_folder, 'model_nationality.pt')
        families_nn_path = os.path.join(weights_folder, 'model_families.pt')
        sex_nn_path = os.path.join(weights_folder, 'model_sex.pt')
        limit_nn_path = os.path.join(weights_folder, 'model_limit.pt')

        self.nationality_clf = DBERTClassifier()
        self.nationality_clf.load_model(nationality_nn_path)

        self.families_clf = DBERTClassifier()
        self.families_clf.load_model(families_nn_path)

        self.sex_clf = DBERTClassifier()
        self.sex_clf.load_model(sex_nn_path)

        self.limit_clf = RNNClassifier(bidirectional=True)
        self.limit_clf.load_model(limit_nn_path, amount_of_words=len(self.word_to_idx))

    def index_tokens(self, tokens_string):
        """Map tokens to vocabulary indices; unknown tokens map to ``unk_idx``."""
        return [self.word_to_idx.get(token, self.unk_idx) for token in tokens_string]

    def fill_with_pads(self, tokens):
        """Return copies of the rows, right-padded with ``pad_idx`` to equal length."""
        max_len = max((len(row) for row in tokens), default=0)
        return [list(row) + [self.pad_idx] * (max_len - len(row)) for row in tokens]

    def as_matrix(self, tokens):
        """Convert token lists to a rectangular list of index rows (input untouched)."""
        return self.fill_with_pads([self.index_tokens(s) for s in tokens])

    def batch_generator(self, tokens):
        """Yield batches holding both the DistilBERT and the GRU encodings.

        Each batch is a dict with ``'tokens'``, ``'mask'`` and
        ``'token_type_ids'`` (tokenizer output, padded to 512 positions) and
        ``'tokens_rnn'`` (vocabulary indices padded to the longest row).
        """
        for i in range(0, len(tokens), self.batch_size):
            batch_tokens = tokens[i: i + self.batch_size]
            inputs = self.tokenizer.batch_encode_plus(
                [' '.join(s) for s in batch_tokens],
                add_special_tokens=True,
                max_length=512,
                truncation=True,
                padding='max_length',  # replaces deprecated pad_to_max_length=True
                return_token_type_ids=True
            )
            yield {
                'tokens': torch.tensor(inputs['input_ids'], device=self.device, dtype=torch.long),
                'mask': torch.tensor(inputs['attention_mask'], device=self.device, dtype=torch.long),
                'token_type_ids': torch.tensor(inputs['token_type_ids'], device=self.device,
                                               dtype=torch.long),
                'tokens_rnn': torch.tensor(self.as_matrix(batch_tokens),
                                           dtype=torch.int,
                                           device=self.device)
            }

    def predict_probas(self, tokens):
        """Return ``{label: (n, 2) NumPy array of class probabilities}``.

        For empty input every array has shape ``(0, 2)``.
        """
        bert_models = {
            'nationality': self.nationality_clf.nnet,
            'families': self.families_clf.nnet,
            'sex': self.sex_clf.nnet,
        }
        rnn_model = self.limit_clf.nnet
        collected = {'nationality': [], 'families': [], 'sex': [], 'limit': []}
        with torch.no_grad():
            for model in bert_models.values():
                model.eval()
            rnn_model.eval()
            for batch in self.batch_generator(tokens):
                for label, model in bert_models.items():
                    collected[label].append(
                        model(batch['tokens'], batch['mask'], batch['token_type_ids']))
                collected['limit'].append(rnn_model(batch['tokens_rnn']))

        pred_probas = dict()
        for label, chunks in collected.items():
            if chunks:
                scores = torch.cat(chunks)
            else:
                # No input rows: keep the (n, classes) layout so callers can
                # still slice the positive-class column.
                scores = torch.empty((0, 2), device=self.device)
            # Normalise over the class axis explicitly.
            pred_probas[label] = F.softmax(scores, dim=-1).detach().cpu().numpy()
        return pred_probas

    def predict_labels(self, tokens):
        """Return ``{label: boolean array}``.

        An entry is True where the probability in column 1 reaches that
        label's threshold in ``THRESHOLDS``.
        """
        predicted_probas = self.predict_probas(tokens)
        return {
            label: probas[:, 1] >= self.THRESHOLDS[label]
            for label, probas in predicted_probas.items()
        }

    def save_model(self, filepath):
        """Serialize this whole object (not just weights) to ``filepath``."""
        with open(filepath, 'wb') as file:
            torch.save(self, file)