fruitpicker01
committed on
Commit
•
6a6025e
1
Parent(s):
677b493
Update utils.py
Browse files
utils.py
CHANGED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
import pymorphy2
|
5 |
+
import torch
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from transformers import AutoTokenizer, AutoModel
|
8 |
+
|
9 |
+
# Module-level NLP resources, created once at import time.

# Checkpoint shared by the tokenizer and the encoder below.
_ROSBERTA_CHECKPOINT = "ai-forever/ru-en-RoSBERTa"

# Morphological analyzer used by preprocess_text for lemmatization.
morph = pymorphy2.MorphAnalyzer()

# Tokenizer and encoder model, both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(_ROSBERTA_CHECKPOINT)
model = AutoModel.from_pretrained(_ROSBERTA_CHECKPOINT)
|
12 |
+
|
13 |
+
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two vectors.

    Args:
        embedding1: Array-like vector.
        embedding2: Array-like vector compatible with ``embedding1`` for
            ``np.dot``.

    Returns:
        The dot product of the inputs divided by the product of their
        Euclidean norms, or ``0.0`` when either input has zero norm.
    """
    # asarray avoids copying inputs that are already NumPy arrays.
    vec_a = np.asarray(embedding1)
    vec_b = np.asarray(embedding2)

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # A zero vector has no direction; dividing would produce NaN and a
        # NumPy RuntimeWarning, which then poisons any downstream averaging.
        return 0.0

    return np.dot(vec_a, vec_b) / denominator
|
22 |
+
|
23 |
+
def pool(hidden_state, mask, pooling_method="cls"):
    """Collapse per-token hidden states into one vector per sequence.

    Args:
        hidden_state: Tensor indexed as (batch, token, feature).
        mask: Attention mask indexed as (batch, token); non-zero entries mark
            the tokens that take part in mean pooling.
        pooling_method: ``"cls"`` returns the hidden state of the first token
            of every sequence; ``"mean"`` returns the mask-weighted average
            over the token axis.

    Returns:
        Tensor indexed as (batch, feature).

    Raises:
        ValueError: If ``pooling_method`` is neither ``"cls"`` nor ``"mean"``.
    """
    if pooling_method == "cls":
        return hidden_state[:, 0]

    if pooling_method == "mean":
        weights = mask.unsqueeze(-1).float()
        summed = torch.sum(hidden_state * weights, dim=1)
        # Clamp so that a fully masked row yields zeros instead of 0/0 = NaN.
        counts = mask.sum(dim=1, keepdim=True).float().clamp(min=1e-9)
        return summed / counts

    # Previously an unknown method fell through and silently returned None.
    raise ValueError(
        f"Unknown pooling_method {pooling_method!r}; expected 'cls' or 'mean'"
    )
|
30 |
+
|
31 |
+
def text_to_embedding(text, tokenizer, model):
    """Encode ``text`` into L2-normalized embedding rows as a NumPy array.

    Args:
        text: Input handed to ``tokenizer`` unchanged.
        tokenizer: Callable invoked with ``max_length=512``, ``padding=True``,
            ``truncation=True`` and ``return_tensors="pt"``; its result must
            be a mapping that contains ``"attention_mask"``.
        model: Callable invoked with the tokenizer output as keyword
            arguments; its result must expose ``last_hidden_state``.

    Returns:
        2-D NumPy array whose rows are the CLS-pooled hidden states scaled to
        unit L2 norm.
    """
    # Tokenize the text.
    tokenized_inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors="pt")

    # Run the inputs on the device the model lives on; without this a model
    # moved to an accelerator fails with a device-mismatch error.
    device = getattr(model, "device", None)
    if device is not None:
        tokenized_inputs = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in tokenized_inputs.items()
        }

    with torch.no_grad():
        outputs = model(**tokenized_inputs)

    embeddings = pool(
        outputs.last_hidden_state,
        tokenized_inputs["attention_mask"],
        pooling_method="cls"  # or try "mean"
    )

    # Tensor.numpy() only works on CPU tensors; .cpu() is a no-op on CPU.
    embeddings = F.normalize(embeddings, p=2, dim=1).cpu().numpy()

    return embeddings
|
46 |
+
|
47 |
+
def preprocess_text(text):
    """Lemmatize ``text`` token by token.

    The text is split on whitespace, every token is replaced by the normal
    form of the first parse that ``morph`` returns for it, and the non-blank
    normal forms are joined back with single spaces.

    Args:
        text: String to lemmatize.

    Returns:
        Space-separated lemmas, or an empty string when there are none.
    """
    first_parses = (morph.parse(word)[0] for word in text.split())
    normal_forms = (parse.normal_form for parse in first_parses)
    # Skip empty or whitespace-only normal forms.
    return " ".join(form for form in normal_forms if form and form.strip())
|
57 |
+
|
58 |
+
def product_extraction(text):
    """Detect which product a text is about from its lemmatized form.

    The text is lemmatized with ``preprocess_text`` and checked against an
    ordered list of substring rules; the first matching rule wins, so more
    specific phrases are listed before the generic ones.

    Args:
        text: Raw text to classify.

    Returns:
        The product label of the first matching rule, or ``'прочее'`` when no
        rule matches.
    """
    lemmas = preprocess_text(text)

    # (substrings to look for, label to return); order is significant.
    rules = (
        (('кредитный бизнес-',), 'кредитная бизнес-карта'),
        (('выпустить бизнес-карта',), 'бизнес-карта'),
        (('расчётный счёт', 'открыть счёт'), 'расчетный счет'),
        (('бизнес-карта',), 'бизнес-карта'),
        (('бизнес-кешбэк',), 'cashback'),
        (('перевод',), 'переводы'),
        (('кредит',), 'кредит'),
        (('эквайринг',), 'эквайринг'),
        (('зарплатный проект',), 'зарплатный проект'),
        (('вклад',), 'вклад'),
        (('депозит',), 'депозит'),
    )

    for markers, label in rules:
        if any(marker in lemmas for marker in markers):
            return label

    return 'прочее'
|
83 |
+
|
84 |
+
def best_text_choice(texts, core_df, tokenizer, model, coef=1):
    """Rank candidate texts by a similarity-weighted score against ``core_df``.

    Every text is lower-cased, prefixed with ``'search_query: '`` and embedded
    with ``text_to_embedding``; its product is detected from the original text
    with ``product_extraction``. Each reference row is scored as
    ``value * cosine_similarity(reference embedding, text embedding)``. A
    text's score is the mean of two components:

    * the mean of the three highest row scores among reference rows of the
      same product, multiplied by ``coef``;
    * the mean row score of the three most similar reference rows of other
      products.

    For ranking, the score is expressed as a percentage of the largest
    ``value`` among the same-product reference rows.

    Args:
        texts: Candidate texts; used to build a single-column DataFrame.
        core_df: Reference DataFrame with the columns ``'embedding'``,
            ``'value'`` and ``'product_type'``. It is copied, not modified.
        tokenizer: Forwarded to ``text_to_embedding``.
        model: Forwarded to ``text_to_embedding``.
        coef: Multiplier for the same-product component of the score.

    Returns:
        DataFrame with the columns ``'Место'`` (1-based rank, highest score
        first) and ``'text'``.
    """
    reference_df = core_df.copy()

    texts_df = pd.DataFrame(texts, columns=['texts'])
    texts_df['texts_lower'] = texts_df['texts'].apply(lambda x: x.lower())
    texts_df['texts_'] = 'search_query: ' + texts_df['texts_lower']
    texts_df['embeddings'] = texts_df['texts_'].apply(
        lambda x: text_to_embedding(x, tokenizer, model)[0]
    )
    texts_df['product'] = texts_df['texts'].apply(product_extraction)

    scoring_list = []
    for text, embedding, product in zip(
        texts_df['texts'], texts_df['embeddings'], texts_df['product']
    ):
        reference_df['similarity'] = reference_df['embedding'].apply(
            lambda ref: cosine_similarity(ref, embedding)
        )
        reference_df['score'] = reference_df['value'] * reference_df['similarity']

        # NOTE: DataFrame.query resolves ``@product`` from this function's
        # local variables, so the queries must stay in this scope.
        same_product = reference_df.query('product_type == @product')
        same_product_score = (
            same_product
            .sort_values(by='score', ascending=False)['score']
            .head(3)
            .mean() * coef
        )
        other_product_score = (
            reference_df
            .sort_values(by='similarity', ascending=False)
            .query('product_type != @product')['score']
            .head(3)
            .mean()
        )
        score_ = np.mean([same_product_score, other_product_score])

        # NOTE(review): when no reference row matches the product, the score
        # comes out as NaN and the text sorts last -- confirm this is intended.
        scoring_list.append([text, 100 * score_ / same_product['value'].max()])

    scoring_df = pd.DataFrame(scoring_list, columns=['text', 'score'])
    scoring_df = scoring_df.sort_values(by='score', ascending=False).reset_index(drop=True)
    scoring_df.index += 1  # ranks start at 1
    return scoring_df.reset_index().rename(columns={'index': 'Место'})[['Место', 'text']]
|