{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Import Libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from sentence_transformers import SentenceTransformer" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from datasets import load_from_disk\n", "import os" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "base_path = os.getcwd()\n", "dataset_folder = \"conala\"\n", "path_to_dataset = os.path.join(base_path,dataset_folder)\n", "dataset = load_from_disk(path_to_dataset)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create vector embedding" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/swastikm/work/senetence-transformer-in-action-main/semantic_search_env/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "model = SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def get_embeddings(examples):\n", " vectors = {}\n", " model_input = examples['intent']\n", " out = model.encode(model_input)\n", " vectors['embedding'] = out\n", " return vectors" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Map: 100%|██████████| 593891/593891 [20:14<00:00, 488.98 examples/s]\n" ] } ], "source": [ "vectorized_dataset = dataset.map(get_embeddings,batched=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['question_id', 'parent_answer_post_id', 'prob', 'snippet', 'intent', 'id', 'embedding'],\n", " num_rows: 593891\n", "})" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectorized_dataset" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Saving the dataset (3/3 shards): 100%|██████████| 593891/593891 [00:08<00:00, 69697.72 examples/s] \n" ] } ], "source": [ "vectorized_dataset.save_to_disk('vectorized_dataset')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "semactic_env", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" 
} }, "nbformat": 4, "nbformat_minor": 2 }