Test_Pipeline_dev

Sleeping

App Files Files Community

Test_Pipeline_dev / utils.py

fruitpicker01

Update utils.py

6a6025e verified about 2 months ago

raw

history blame

4.92 kB

	import re
	import numpy as np
	import pandas as pd
	import pymorphy2
	import torch
	import torch.nn.functional as F
	from transformers import AutoTokenizer, AutoModel

	# Shared morphological analyzer; preprocess_text() uses it to lemmatize tokens.
	morph = pymorphy2.MorphAnalyzer()
	# Tokenizer and encoder model are both created from the
	# "ai-forever/ru-en-RoSBERTa" checkpoint when this module is imported.
	tokenizer = AutoTokenizer.from_pretrained("ai-forever/ru-en-RoSBERTa")
	model = AutoModel.from_pretrained("ai-forever/ru-en-RoSBERTa")

	def cosine_similarity(embedding1, embedding2):
	    """Return the cosine similarity between two embedding vectors.

	    Args:
	        embedding1: Array-like vector (anything ``np.asarray`` accepts).
	        embedding2: Array-like vector of the same length as ``embedding1``.

	    Returns:
	        ``dot(embedding1, embedding2) / (||embedding1|| * ||embedding2||)``.
	        If either vector has zero norm the similarity is undefined; ``0.0``
	        is returned instead of dividing by zero and producing NaN.
	    """
	    vec_a = np.asarray(embedding1)
	    vec_b = np.asarray(embedding2)

	    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
	    # Guard the degenerate case: a zero vector has no direction, and the
	    # plain division would yield NaN that silently spreads into later scores.
	    if denominator == 0:
	        return 0.0

	    return np.dot(vec_a, vec_b) / denominator

	def pool(hidden_state, mask, pooling_method="cls"):
	    """Collapse per-token hidden states into one vector per sequence.

	    Args:
	        hidden_state: Tensor whose dimension 0 is the batch and dimension 1
	            is the token position (the code indexes/sums over dimension 1).
	        mask: Attention mask aligned with the first two dimensions of
	            ``hidden_state``; only used for ``"mean"`` pooling.
	        pooling_method: ``"cls"`` takes the hidden state at token position 0;
	            ``"mean"`` averages the hidden states weighted by ``mask``.

	    Returns:
	        Tensor with the token dimension removed.

	    Raises:
	        ValueError: If ``pooling_method`` is neither ``"cls"`` nor ``"mean"``
	            (previously this case silently returned ``None``).
	    """
	    if pooling_method == "cls":
	        return hidden_state[:, 0]

	    if pooling_method == "mean":
	        weights = mask.unsqueeze(-1).float()
	        summed = torch.sum(hidden_state * weights, dim=1)
	        # Clamp so a row whose mask is all zeros yields zeros instead of
	        # 0/0 = NaN; rows with at least one unmasked token are unaffected.
	        counts = mask.sum(dim=1, keepdim=True).float().clamp(min=1e-9)
	        return summed / counts

	    raise ValueError(
	        f"Unknown pooling_method {pooling_method!r}; expected 'cls' or 'mean'"
	    )

	def text_to_embedding(text, tokenizer, model):
	    """Encode ``text`` into L2-normalized embedding rows.

	    Args:
	        text: Input passed straight to ``tokenizer``.
	        tokenizer: Callable tokenizer; invoked with ``max_length=512``,
	            padding, truncation and PyTorch tensor output.
	        model: Encoder called with the tokenizer output; its
	            ``last_hidden_state`` is pooled.

	    Returns:
	        NumPy array obtained by CLS-pooling the last hidden state and
	        normalizing along dimension 1 with the L2 norm.
	    """
	    # Tokenize the text.
	    encoded = tokenizer(
	        text,
	        max_length=512,
	        padding=True,
	        truncation=True,
	        return_tensors="pt",
	    )

	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        model_output = model(**encoded)

	    pooled = pool(
	        model_output.last_hidden_state,
	        encoded["attention_mask"],
	        pooling_method="cls",  # "mean" is the alternative pool() supports
	    )

	    return F.normalize(pooled, p=2, dim=1).numpy()

	def preprocess_text(text):
	    """Return ``text`` with every whitespace-separated token lemmatized.

	    Each token is analysed with the module-level ``morph`` analyzer and
	    replaced by the ``normal_form`` of its first parse. Tokens whose normal
	    form is empty or whitespace-only are dropped. The lemmas are joined with
	    single spaces; an input with no usable tokens gives ``""``.
	    """
	    # First parse of each token -> its lemma (normal form).
	    normal_forms = (morph.parse(word)[0].normal_form for word in text.split())
	    return " ".join(form for form in normal_forms if form and form.strip())

	def product_extraction(text):
	    """Map ``text`` to a product label by substring rules on its lemmas.

	    The text is lemmatized with ``preprocess_text`` and the rules below are
	    checked top to bottom; the first rule with any of its phrases present as
	    a substring of the lemmatized text decides the label. If no rule matches,
	    ``'прочее'`` is returned.
	    """
	    lemmatized = preprocess_text(text)

	    # (phrases, label) pairs. Order is significant: more specific phrases
	    # come before the generic ones they contain.
	    rules = (
	        (('кредитный бизнес-',), 'кредитная бизнес-карта'),
	        (('выпустить бизнес-карта',), 'бизнес-карта'),
	        (('расчётный счёт', 'открыть счёт'), 'расчетный счет'),
	        (('бизнес-карта',), 'бизнес-карта'),
	        (('бизнес-кешбэк',), 'cashback'),
	        (('перевод',), 'переводы'),
	        (('кредит',), 'кредит'),
	        (('эквайринг',), 'эквайринг'),
	        (('зарплатный проект',), 'зарплатный проект'),
	        (('вклад',), 'вклад'),
	        (('депозит',), 'депозит'),
	    )

	    for phrases, label in rules:
	        if any(phrase in lemmatized for phrase in phrases):
	            return label

	    return 'прочее'

	def best_text_choice(texts, core_df, tokenizer, model, coef=1):
	    """Rank candidate texts by a similarity-weighted score against ``core_df``.

	    For every text: it is lower-cased, prefixed with ``'search_query: '`` and
	    embedded with ``text_to_embedding``; its product label is obtained with
	    ``product_extraction`` from the original (not lower-cased) text. Every
	    row of ``core_df`` then gets ``score = value * cosine_similarity``. The
	    text's score is the mean of two numbers:

	    * the mean of the 3 highest scores among rows of the same product,
	      multiplied by ``coef``;
	    * the mean score of the 3 most similar rows of other products.

	    That score is expressed as a percentage of the largest ``value`` among
	    rows of the same product and used only for ordering.

	    Args:
	        texts: Candidate texts (strings).
	        core_df: DataFrame read through its ``embedding``, ``value`` and
	            ``product_type`` columns. It is copied, not modified.
	        tokenizer: Forwarded to ``text_to_embedding``.
	        model: Forwarded to ``text_to_embedding``.
	        coef: Weight applied to the same-product component.

	    Returns:
	        DataFrame with columns ``'Место'`` (1-based rank) and ``'text'``,
	        ordered by descending score. The numeric score is not returned.
	    """
	    embeddings_df = core_df.copy()

	    texts_df = pd.DataFrame(texts, columns=['texts'])
	    texts_df['texts_lower'] = texts_df['texts'].apply(lambda x: x.lower())
	    texts_df['texts_'] = 'search_query: ' + texts_df['texts_lower']
	    texts_df['embeddings'] = texts_df['texts_'].apply(
	        lambda x: text_to_embedding(x, tokenizer, model)[0]
	    )
	    texts_df['product'] = texts_df['texts'].apply(product_extraction)

	    scoring_list = []
	    for _, row in texts_df.iterrows():
	        # NOTE: `product` must remain a local of this function -- the
	        # query strings below resolve it through the `@product` syntax.
	        product = row['product']
	        text_embedding = row['embeddings']

	        embeddings_df['similarity'] = embeddings_df['embedding'].apply(
	            lambda x: cosine_similarity(x, text_embedding)
	        )
	        embeddings_df['score'] = embeddings_df['value'] * embeddings_df['similarity']

	        # Mean of the 3 best scores among rows of the same product.
	        same_product_top = (
	            embeddings_df
	            .sort_values(by=['product_type', 'score'], ascending=[True, False])
	            .query('product_type == @product')['score'][:3]
	            .mean()
	        )
	        # Mean score of the 3 most similar rows belonging to other products.
	        other_product_top = (
	            embeddings_df
	            .sort_values(by='similarity', ascending=False)
	            .query('product_type != @product')['score'][:3]
	            .mean()
	        )
	        score_ = np.mean([same_product_top * coef, other_product_top])

	        # Normalize by the largest value for this product.
	        # NOTE(review): if core_df has no rows for `product` (or none for
	        # other products) this appears to evaluate to NaN -- confirm that
	        # such texts ending up unranked-by-score is acceptable.
	        best_value = embeddings_df.query('product_type == @product')['value'].max()
	        scoring_list.append([row['texts'], 100 * score_ / best_value])

	    scoring_df = (
	        pd.DataFrame(scoring_list, columns=['text', 'score'])
	        .sort_values(by='score', ascending=False)
	        .reset_index(drop=True)
	    )
	    scoring_df.index += 1  # ranks start at 1
	    return scoring_df.reset_index().rename(columns={'index': 'Место'})[['Место', 'text']]