{ "cells": [ { "cell_type": "code", "execution_count": 12, "id": "23879688", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a7147b7318214d9da894766e55a4a895", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/226 [00:00', 'pad_token': '', 'additional_special_tokens': ['', '', '', '']}, clean_up_tokenization_spaces=True), added_tokens_decoder={\n", "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t3: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t4: AddedToken(\"[START_REF]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t5: AddedToken(\"[END_REF]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t6: AddedToken(\"[IMAGE]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t7: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t8: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t9: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t10: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t11: AddedToken(\"[START_SUP]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t12: AddedToken(\"[END_SUP]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t13: AddedToken(\"[START_SUB]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t14: AddedToken(\"[END_SUB]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t15: AddedToken(\"[START_DNA]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t16: AddedToken(\"[END_DNA]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t17: AddedToken(\"[START_AMINO]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t18: AddedToken(\"[END_AMINO]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t19: AddedToken(\"[START_SMILES]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t20: AddedToken(\"[END_SMILES]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t21: AddedToken(\"[START_I_SMILES]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t22: AddedToken(\"[END_I_SMILES]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", "\t50000: AddedToken(\"\", rstrip=False, 
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "PreTrainedTokenizerFast(name_or_path='theblackcat102/galactica-1.3b-v2', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '', 'pad_token': '', 'additional_special_tokens': ['', '', '', '']}, clean_up_tokenization_spaces=True), added_tokens_decoder={\n",
"\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t3: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t4: AddedToken(\"[START_REF]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t5: AddedToken(\"[END_REF]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t6: AddedToken(\"[IMAGE]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t7: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t8: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t9: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t10: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t11: AddedToken(\"[START_SUP]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t12: AddedToken(\"[END_SUP]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t13: AddedToken(\"[START_SUB]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t14: AddedToken(\"[END_SUB]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t15: AddedToken(\"[START_DNA]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t16: AddedToken(\"[END_DNA]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t17: AddedToken(\"[START_AMINO]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t18: AddedToken(\"[END_AMINO]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t19: AddedToken(\"[START_SMILES]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t20: AddedToken(\"[END_SMILES]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t21: AddedToken(\"[START_I_SMILES]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t22: AddedToken(\"[END_I_SMILES]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t50000: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t50001: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t50002: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t50003: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"}" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer" ] },
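{ "cell_type": "markdown", "id": "note-special-token-ids", "metadata": {}, "source": [ "A quick sketch (assuming the `tokenizer` loaded above) of how the reserved tokens listed in `added_tokens_decoder` map to ids; `convert_tokens_to_ids` / `convert_ids_to_tokens` are the standard lookups:" ] },
{ "cell_type": "code", "execution_count": null, "id": "added-special-token-ids", "metadata": {}, "outputs": [], "source": [ "# Map a few of the reserved tokens shown in the repr above to their ids\n", "for tok in ['[START_REF]', '[END_REF]', '[START_SMILES]', '[END_SMILES]']:\n", "    print(tok, '->', tokenizer.convert_tokens_to_ids(tok))\n", "\n", "# ...and back from ids to token strings\n", "print(tokenizer.convert_ids_to_tokens([4, 5, 19, 20]))" ] },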
{ "cell_type": "code", "execution_count": 46, "id": "695b08bf", "metadata": {}, "outputs": [], "source": [ "text = 'We connect the four chains using SequentialChain. The output of one chain becomes the input to the next chain.'" ] },
{ "cell_type": "code", "execution_count": 47, "id": "976a0ffc", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'input_ids': [1246, 12775, 286, 1715, 7067, 672, 22319, 40118, 36, 381, 2380, 299, 717, 2764, 3778, 286, 1964, 321, 286, 2857, 2764, 36], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized = tokenizer(text)\n", "tokenized" ] },
{ "cell_type": "code", "execution_count": 48, "id": "5004a189", "metadata": {}, "outputs": [], "source": [ "# dir(tokenized)" ] },
{ "cell_type": "code", "execution_count": 62, "id": "e80ede4a", "metadata": {}, "outputs": [], "source": [ "tokens = tokenizer.encode(text, return_tensors='pt')\n", "# print(\"These are tokens!\", tokens)\n", "# tokens, tokenized['input_ids']\n", "# for token in tokens[0]:\n", "#     print(\"These are the decoded tokens!\", tokenizer.decode([token]))" ] },
{ "cell_type": "code", "execution_count": 54, "id": "0fcbc2ce", "metadata": {}, "outputs": [], "source": [ "model = AutoModelForCausalLM.from_pretrained(model_name)\n", "# print(model.get_input_embeddings()(tokens))\n", "# for e in model.get_input_embeddings()(tokens)[0]:\n", "#     print(\"This is an embedding!\", e)" ] },
{ "cell_type": "code", "execution_count": 55, "id": "a6c31367", "metadata": {}, "outputs": [], "source": [ "# dir(model)" ] },
{ "cell_type": "code", "execution_count": 67, "id": "f6be2cdf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([ 0.0139, -0.0367,  0.0186, -0.0564, -0.0004, -0.0255, -0.0011, -0.0179,\n", "        -0.0128, -0.0046, -0.0361, -0.0222,  0.0443,  0.0058, -0.0008,  0.0186,\n", "        -0.0252, -0.0082,  0.0186, -0.0195,  0.0058, -0.0128],\n", "       grad_fn=<IndexBackward0>)" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The embedding matrix is the only parameter of the input-embedding module;\n", "# index it with the token ids and take the first coordinate of each vector.\n", "embs = [emb for emb in model.get_input_embeddings().parameters()]\n", "embs[0][tokenized['input_ids'], 0]" ] },
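{ "cell_type": "markdown", "id": "note-embedding-lookup", "metadata": {}, "source": [ "The same lookup can be done by calling the embedding module directly on the token ids. A short sketch (assuming `model` and `tokens` from the cells above); it returns one hidden-size vector per token, and its first coordinates should match the values printed above:" ] },
{ "cell_type": "code", "execution_count": null, "id": "added-embedding-lookup", "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", "# The embedding layer maps each token id to a hidden-size vector.\n", "# `tokens` has shape (1, seq_len), so the result has shape (1, seq_len, hidden_size).\n", "with torch.no_grad():\n", "    token_embeddings = model.get_input_embeddings()(tokens)\n", "\n", "print(token_embeddings.shape)\n", "print(token_embeddings[0, :, 0])  # should match the tensor shown in the previous cell" ] },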
{ "cell_type": "code", "execution_count": null, "id": "b4be6e5c", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "id": "511fb260", "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "id": "705ff113", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "df88b11fada24110bdaca104aecc5a3f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards:   0%|          | 0/2 [00:00