{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "da5094d4-73fa-4e6c-89a1-0639709d9bc0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence: This framework generates embeddings for each input sentence\n",
      "Embedding: <class 'numpy.ndarray'> 384\n",
      "\n",
      "Sentence: Sentences are passed as a list of string.\n",
      "Embedding: <class 'numpy.ndarray'> 384\n",
      "\n",
      "Sentence: The quick brown fox jumps over the lazy dog.\n",
      "Embedding: <class 'numpy.ndarray'> 384\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
    "\n",
    "#Our sentences we like to encode\n",
    "sentences = ['This framework generates embeddings for each input sentence',\n",
    "    'Sentences are passed as a list of string.',\n",
    "    'The quick brown fox jumps over the lazy dog.']\n",
    "\n",
    "#Sentences are encoded by calling model.encode()\n",
    "embeddings = model.encode(sentences)\n",
    "\n",
    "#Print the embeddings\n",
    "for sentence, embedding in zip(sentences, embeddings):\n",
    "    print(\"Sentence:\", sentence)\n",
    "    print(\"Embedding:\", type(embedding), embedding.size)\n",
    "    print(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9f519c4d-1a1a-4f74-801d-2bb9e4e14e3a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cosine-Similarity: tensor([[0.6153]])\n"
     ]
    }
   ],
   "source": [
    "\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
    "\n",
    "#Sentences are encoded by calling model.encode()\n",
    "emb1 = model.encode(\"This is a red cat with a hat.\")\n",
    "emb2 = model.encode(\"Have you seen my red cat?\")\n",
    "\n",
    "cos_sim = util.cos_sim(emb1, emb2)\n",
    "print(\"Cosine-Similarity:\", cos_sim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "74e2bf51-6e6d-4d80-8449-6c7d168d561a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top-5 most similar pairs:\n",
      "A man is eating food. \t A man is eating a piece of bread. \t 0.7553\n",
      "A man is riding a horse. \t A man is riding a white horse on an enclosed ground. \t 0.7369\n",
      "A monkey is playing drums. \t Someone in a gorilla costume is playing a set of drums. \t 0.6433\n",
      "A woman is playing violin. \t Someone in a gorilla costume is playing a set of drums. \t 0.2564\n",
      "A man is eating food. \t A man is riding a horse. \t 0.2474\n"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer, util\n",
    "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
    "\n",
    "sentences = ['A man is eating food.',\n",
    "          'A man is eating a piece of bread.',\n",
    "          'The girl is carrying a baby.',\n",
    "          'A man is riding a horse.',\n",
    "          'A woman is playing violin.',\n",
    "          'Two men pushed carts through the woods.',\n",
    "          'A man is riding a white horse on an enclosed ground.',\n",
    "          'A monkey is playing drums.',\n",
    "          'Someone in a gorilla costume is playing a set of drums.'\n",
    "          ]\n",
    "\n",
    "#Encode all sentences\n",
    "embeddings = model.encode(sentences)\n",
    "\n",
    "#Compute cosine similarity between all pairs\n",
    "cos_sim = util.cos_sim(embeddings, embeddings)\n",
    "\n",
    "#Add all pairs to a list with their cosine similarity score\n",
    "all_sentence_combinations = []\n",
    "for i in range(len(cos_sim)-1):\n",
    "    for j in range(i+1, len(cos_sim)):\n",
    "        all_sentence_combinations.append([cos_sim[i][j], i, j])\n",
    "\n",
    "#Sort list by the highest cosine similarity score\n",
    "all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)\n",
    "\n",
    "print(\"Top-5 most similar pairs:\")\n",
    "for score, i, j in all_sentence_combinations[0:5]:\n",
    "    print(\"{} \\t {} \\t {:.4f}\".format(sentences[i], sentences[j], cos_sim[i][j]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1ae46dd-1c19-4385-85b3-ec8f13dc6fe5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ae4f9f4-b9dd-440e-86ec-7ec1ba7166e7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a00a4b61-3e9e-4e92-aa4e-c972b78bfcb8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07a67248-1f90-4163-98e5-3daf612686d1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "989ebf4f-1078-4431-b7d7-95d0470b86b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "model = SentenceTransformer('all-distilroberta-v1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "77ddfd4f-cdf9-4193-a479-d2d2ef86d780",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence: This framework generates embeddings for each input sentence\n",
      "Embedding: <class 'numpy.ndarray'> 768\n",
      "\n",
      "Sentence: Sentences are passed as a list of string.\n",
      "Embedding: <class 'numpy.ndarray'> 768\n",
      "\n",
      "Sentence: The quick brown fox jumps over the lazy dog.\n",
      "Embedding: <class 'numpy.ndarray'> 768\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentences = ['This framework generates embeddings for each input sentence',\n",
    "    'Sentences are passed as a list of string.',\n",
    "    'The quick brown fox jumps over the lazy dog.']\n",
    "\n",
    "embeddings = model.encode(sentences)\n",
    "\n",
    "for sentence, embedding in zip(sentences, embeddings):\n",
    "    print(\"Sentence:\", sentence)\n",
    "    print(\"Embedding:\", type(embedding), embedding.size)\n",
    "    print(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff56d32d-9046-41d6-bb92-ac08a176faf2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8de0b905-99ad-4b12-8aa8-76cd2cad8252",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27a8ed1b-47e0-4de1-b9fb-8e939efff368",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3948830e-5ce7-4d97-9f26-eec9904671e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer, models\n",
    "\n",
    "word_embedding_model = models.Transformer('distilroberta-base')\n",
    "\n",
    "## Step 2: use a pool function over the token embeddings\n",
    "pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())\n",
    "\n",
    "## Join steps 1 and 2 using the modules argument\n",
    "model = SentenceTransformer(modules=[word_embedding_model, pooling_model])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}