{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Semantic search over precomputed series embeddings\n",
    "\n",
    "Embed a free-text query with a locally saved BERT model, compare it against\n",
    "precomputed item embeddings (`data/embs.txt`) by cosine similarity, and\n",
    "return the indices of the 10 most similar rows (most similar first)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer, BertModel\n",
    "import torch\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import numpy as np\n",
    "import time"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths to the saved model, tokenizer, and precomputed embeddings.\n",
    "MODEL_PATH = \"model\"\n",
    "TOKENIZER_PATH = \"tokenizer\"\n",
    "EMBEDDINGS_PATH = \"data/embs.txt\"\n",
    "\n",
    "TOP_K = 10        # number of nearest items to return\n",
    "MAX_LENGTH = 512  # BERT's positional limit; required for truncation to take effect\n",
    "\n",
    "QUERY = \"\\u043f\\u0435\\u0442\\u0443\\u0445 \\u0437\\u0430\\u043a\\u0443\\u043a\\u0430\\u0440\\u0435\\u043a\\u0430\\u043b\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Embed the query and rank items"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_time = time.time()\n",
    "\n",
    "# Precomputed item embeddings, one row per item.\n",
    "embeddings = np.loadtxt(EMBEDDINGS_PATH)\n",
    "\n",
    "# Load the saved model and tokenizer, and switch the model to inference mode.\n",
    "model = BertModel.from_pretrained(MODEL_PATH)\n",
    "tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)\n",
    "model.eval()\n",
    "\n",
    "# Tokenize the query. max_length must be given explicitly: with\n",
    "# truncation=True alone the tokenizer warns and does NOT truncate.\n",
    "tokens = tokenizer(\n",
    "    QUERY,\n",
    "    return_tensors=\"pt\",\n",
    "    padding=True,\n",
    "    truncation=True,\n",
    "    max_length=MAX_LENGTH,\n",
    ")\n",
    "\n",
    "# Move the token tensors to the same device as the model.\n",
    "tokens = {key: value.to(model.device) for key, value in tokens.items()}\n",
    "\n",
    "# Run the model and mean-pool the last hidden state over the sequence\n",
    "# dimension to get a single query vector.\n",
    "with torch.no_grad():\n",
    "    output = model(**tokens)\n",
    "query_embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()\n",
    "\n",
    "# Cosine similarity of the query against every stored embedding\n",
    "# (flattened to a 1-D array of per-item scores).\n",
    "cosine_similarities = cosine_similarity(\n",
    "    embeddings, query_embedding.reshape(1, -1)\n",
    ").ravel()\n",
    "\n",
    "# Indices of the TOP_K most similar rows, most similar first.\n",
    "top_indices = np.argsort(cosine_similarities)[::-1][:TOP_K]\n",
    "print(top_indices)\n",
    "print(f\"elapsed: {time.time() - start_time:.2f}s\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Result\n",
    "\n",
    "Top-`TOP_K` item indices as a plain Python list, best match first."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_indices.tolist()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".elbrus2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}