Vladislawoo committed on
Commit
6e57eae
1 Parent(s): 2636684

Upload faiss.ipynb

Browse files
Files changed (1) hide show
  1. faiss.ipynb +102 -0
faiss.ipynb ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "id": "d82abfc8-1e95-41f0-a9af-4946de3ad846",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "ДХЛ. Красная шапочка и другие сказки\n",
14
+ "Ослиная шкура\n",
15
+ "Рождественское чудо мистера Туми\n"
16
+ ]
17
+ }
18
+ ],
19
+ "source": [
20
+ "import pandas as pd\n",
21
+ "import torch\n",
22
+ "import numpy as np\n",
23
+ "from transformers import AutoTokenizer, AutoModel\n",
24
+ "import faiss\n",
25
+ "\n",
26
+ "# Загрузка модели и токенизатора BERT\n",
27
+ "model_name = \"cointegrated/rubert-tiny2\"\n",
28
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
29
+ "model = AutoModel.from_pretrained(model_name)\n",
30
+ "\n",
31
+ "# Загрузка данных из CSV\n",
32
+ "df = pd.read_csv('final_data.csv')\n",
33
+ "\n",
34
+ "# Максимальная длина текста\n",
35
+ "MAX_LEN = 300\n",
36
+ "\n",
37
+ "# Функция для встраивания текста с использованием BERT\n",
38
+ "def embed_bert_cls(text, model=model, tokenizer=tokenizer):\n",
39
+ " t = tokenizer(text,\n",
40
+ " padding=True,\n",
41
+ " truncation=True,\n",
42
+ " return_tensors='pt',\n",
43
+ " max_length=MAX_LEN)\n",
44
+ " with torch.no_grad():\n",
45
+ " model_output = model(**{k: v.to(model.device) for k, v in t.items()})\n",
46
+ " embeddings = model_output.last_hidden_state[:, 0, :]\n",
47
+ " embeddings = torch.nn.functional.normalize(embeddings)\n",
48
+ " return embeddings[0].cpu().squeeze()\n",
49
+ "\n",
50
+ "# Загрузка предварительно вычисленных векторов\n",
51
+ "embeddings = np.loadtxt('embeddings.txt')\n",
52
+ "embeddings_tensor = [torch.tensor(embedding) for embedding in embeddings]\n",
53
+ "\n",
54
+ "# Создание индекса Faiss\n",
55
+ "embeddings_matrix = np.stack(embeddings)\n",
56
+ "index = faiss.IndexFlatIP(embeddings_matrix.shape[1])\n",
57
+ "index.add(embeddings_matrix)\n",
58
+ "\n",
59
+ "# Текст запроса\n",
60
+ "text = 'добрую сказку с плохим концом для детей'\n",
61
+ "\n",
62
+ "# Встраивание запроса и поиск ближайших векторов с использованием Faiss\n",
63
+ "query_embedding = embed_bert_cls(text)\n",
64
+ "query_embedding = query_embedding.numpy().astype('float32')\n",
65
+ "k, indices = index.search(np.expand_dims(query_embedding, axis=0), 3)\n",
66
+ "\n",
67
+ "# Вывод результатов\n",
68
+ "for i in indices[0]:\n",
69
+ " print(df['title'][i])"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "id": "c0aa7ef2-7f93-4300-9555-047bbc6c1036",
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": []
79
+ }
80
+ ],
81
+ "metadata": {
82
+ "kernelspec": {
83
+ "display_name": "Python 3 (ipykernel)",
84
+ "language": "python",
85
+ "name": "python3"
86
+ },
87
+ "language_info": {
88
+ "codemirror_mode": {
89
+ "name": "ipython",
90
+ "version": 3
91
+ },
92
+ "file_extension": ".py",
93
+ "mimetype": "text/x-python",
94
+ "name": "python",
95
+ "nbconvert_exporter": "python",
96
+ "pygments_lexer": "ipython3",
97
+ "version": "3.11.3"
98
+ }
99
+ },
100
+ "nbformat": 4,
101
+ "nbformat_minor": 5
102
+ }