{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']\n", "- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([4, 768])\n" ] } ], "source": [ "from multilingual_clip import pt_multilingual_clip\n", "import transformers\n", "\n", "texts = [\n", " 'Three blind horses listening to Mozart.',\n", " 'Älgen är skogens konung!',\n", " 'Wie leben Eisbären in der Antarktis?',\n", " 'Вы знали, что все белые медведи левши?'\n", "]\n", "model_name = 'M-CLIP/XLM-Roberta-Large-Vit-L-14'\n", "\n", "# Load Model & Tokenizer\n", "model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name)\n", "tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)\n", "\n", "embeddings = model.forward(texts, tokenizer)\n", "print(embeddings.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "texts = [\n", " 'Aku sayang kamu',\n", " 'Aku benci kamu',\n", "]\n", "embeddings = model.forward(texts, tokenizer)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "embeddings_1, embeddings_2 = embeddings\n", "embeddings_1 = embeddings_1.cpu().detach().numpy()\n", "embeddings_2 = embeddings_2.cpu().detach().numpy()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from numpy.linalg import norm" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.967305\n" ] } ], "source": [ "cosine = np.dot(embeddings_1,embeddings_2)/(norm(embeddings_1)*norm(embeddings_2))\n", "print(cosine)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "ad2bdc8ecc057115af97d19610ffacc2b4e99fae6737bb82f5d7fb13d2f2c186" } } }, "nbformat": 4, "nbformat_minor": 2 }