{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "overview",
   "metadata": {},
   "source": [
    "# Sentence embeddings with `sentence-transformers`\n",
    "\n",
    "Short experiments with the `sentence_transformers` package: encoding sentences into vectors, comparing them with cosine similarity, and assembling a model from a transformer plus a pooling module."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-encode-minilm",
   "metadata": {},
   "source": [
    "## 1. Encode sentences with `all-MiniLM-L6-v2`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "da5094d4-73fa-4e6c-89a1-0639709d9bc0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence: This framework generates embeddings for each input sentence\n",
      "Embedding: <class 'numpy.ndarray'> 384\n",
      "\n",
      "Sentence: Sentences are passed as a list of string.\n",
      "Embedding: <class 'numpy.ndarray'> 384\n",
      "\n",
      "Sentence: The quick brown fox jumps over the lazy dog.\n",
      "Embedding: <class 'numpy.ndarray'> 384\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
    "\n",
    "# Sentences we would like to encode\n",
    "sentences = ['This framework generates embeddings for each input sentence',\n",
    "             'Sentences are passed as a list of string.',\n",
    "             'The quick brown fox jumps over the lazy dog.']\n",
    "\n",
    "# Sentences are encoded by calling model.encode()\n",
    "embeddings = model.encode(sentences)\n",
    "\n",
    "# Print each sentence with the type and size of its embedding\n",
    "for sentence, embedding in zip(sentences, embeddings):\n",
    "    print(\"Sentence:\", sentence)\n",
    "    print(\"Embedding:\", type(embedding), embedding.size)\n",
    "    print(\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-pair-similarity",
   "metadata": {},
   "source": [
    "## 2. Cosine similarity between two sentences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9f519c4d-1a1a-4f74-801d-2bb9e4e14e3a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cosine-Similarity: tensor([[0.6153]])\n"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer, util\n",
    "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
    "\n",
    "# Sentences are encoded by calling model.encode()\n",
    "emb1 = model.encode(\"This is a red cat with a hat.\")\n",
    "emb2 = model.encode(\"Have you seen my red cat?\")\n",
    "\n",
    "cos_sim = util.cos_sim(emb1, emb2)\n",
    "print(\"Cosine-Similarity:\", cos_sim)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-top-pairs",
   "metadata": {},
   "source": [
    "## 3. Most similar sentence pairs in a small corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "74e2bf51-6e6d-4d80-8449-6c7d168d561a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Top-5 most similar pairs:\n",
      "A man is eating food. \t A man is eating a piece of bread. \t 0.7553\n",
      "A man is riding a horse. \t A man is riding a white horse on an enclosed ground. \t 0.7369\n",
      "A monkey is playing drums. \t Someone in a gorilla costume is playing a set of drums. \t 0.6433\n",
      "A woman is playing violin. \t Someone in a gorilla costume is playing a set of drums. \t 0.2564\n",
      "A man is eating food. \t A man is riding a horse. \t 0.2474\n"
     ]
    }
   ],
   "source": [
    "from sentence_transformers import SentenceTransformer, util\n",
    "model = SentenceTransformer('all-MiniLM-L6-v2')\n",
    "\n",
    "sentences = ['A man is eating food.',\n",
    "             'A man is eating a piece of bread.',\n",
    "             'The girl is carrying a baby.',\n",
    "             'A man is riding a horse.',\n",
    "             'A woman is playing violin.',\n",
    "             'Two men pushed carts through the woods.',\n",
    "             'A man is riding a white horse on an enclosed ground.',\n",
    "             'A monkey is playing drums.',\n",
    "             'Someone in a gorilla costume is playing a set of drums.'\n",
    "             ]\n",
    "\n",
    "# Encode all sentences\n",
    "embeddings = model.encode(sentences)\n",
    "\n",
    "# Compute cosine similarity between all pairs\n",
    "cos_sim = util.cos_sim(embeddings, embeddings)\n",
    "\n",
    "# Add all pairs to a list with their cosine similarity score.\n",
    "# j starts at i+1, so each unordered pair is listed once and self-pairs are skipped.\n",
    "all_sentence_combinations = []\n",
    "for i in range(len(cos_sim)-1):\n",
    "    for j in range(i+1, len(cos_sim)):\n",
    "        all_sentence_combinations.append([cos_sim[i][j], i, j])\n",
    "\n",
    "# Sort list by the highest cosine similarity score\n",
    "all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)\n",
    "\n",
    "print(\"Top-5 most similar pairs:\")\n",
    "for score, i, j in all_sentence_combinations[0:5]:\n",
    "    # score is the cos_sim[i][j] entry stored when the pair list was built\n",
    "    print(\"{} \\t {} \\t {:.4f}\".format(sentences[i], sentences[j], score))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1ae46dd-1c19-4385-85b3-ec8f13dc6fe5",
   "metadata": {},
   "source": [
    "## 4. Same sentences with `all-distilroberta-v1`\n",
    "\n",
    "The recorded output below reports embeddings of size 768, versus 384 for `all-MiniLM-L6-v2` above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "989ebf4f-1078-4431-b7d7-95d0470b86b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer\n",
    "model = SentenceTransformer('all-distilroberta-v1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "77ddfd4f-cdf9-4193-a479-d2d2ef86d780",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence: This framework generates embeddings for each input sentence\n",
      "Embedding: <class 'numpy.ndarray'> 768\n",
      "\n",
      "Sentence: Sentences are passed as a list of string.\n",
      "Embedding: <class 'numpy.ndarray'> 768\n",
      "\n",
      "Sentence: The quick brown fox jumps over the lazy dog.\n",
      "Embedding: <class 'numpy.ndarray'> 768\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentences = ['This framework generates embeddings for each input sentence',\n",
    "             'Sentences are passed as a list of string.',\n",
    "             'The quick brown fox jumps over the lazy dog.']\n",
    "\n",
    "# Uses the model loaded in the previous cell\n",
    "embeddings = model.encode(sentences)\n",
    "\n",
    "# Print each sentence with the type and size of its embedding\n",
    "for sentence, embedding in zip(sentences, embeddings):\n",
    "    print(\"Sentence:\", sentence)\n",
    "    print(\"Embedding:\", type(embedding), embedding.size)\n",
    "    print(\"\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff56d32d-9046-41d6-bb92-ac08a176faf2",
   "metadata": {},
   "source": [
    "## 5. Build a model from a transformer and a pooling module\n",
    "\n",
    "This cell has not been executed yet, so it has no recorded output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3948830e-5ce7-4d97-9f26-eec9904671e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer, models\n",
    "\n",
    "## Step 1: load an existing language model; its token embeddings feed step 2\n",
    "word_embedding_model = models.Transformer('distilroberta-base')\n",
    "\n",
    "## Step 2: use a pool function over the token embeddings\n",
    "pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())\n",
    "\n",
    "## Join steps 1 and 2 using the modules argument\n",
    "model = SentenceTransformer(modules=[word_embedding_model, pooling_model])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}