{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "Q-bj6K7Qv4ft" }, "source": [ "# Fine-Tuning a Generative Pretrained Transformer (`GPT`)\n", "\n", "1. Install required libraries." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SBWCrz5GfBXo", "outputId": "64731371-7825-4004-d434-a959eb03d392" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting transformers\n", " Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m100.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting datasets\n", " Downloading datasets-2.13.1-py3-none-any.whl (486 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.2/486.2 kB\u001b[0m \u001b[31m48.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting codecarbon\n", " Downloading codecarbon-2.2.4-py3-none-any.whl (176 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.0/176.0 kB\u001b[0m \u001b[31m21.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n", " Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from 
transformers) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n", " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m106.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n", " Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m82.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n", "Collecting dill<0.3.7,>=0.3.0 (from datasets)\n", " Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", "Collecting xxhash (from datasets)\n", " Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m26.7 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting multiprocess (from datasets)\n", " Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.4)\n", "Collecting arrow (from codecarbon)\n", " Downloading arrow-1.2.3-py3-none-any.whl (66 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.4/66.4 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting pynvml (from codecarbon)\n", " Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from codecarbon) (5.9.5)\n", "Requirement already satisfied: py-cpuinfo in /usr/local/lib/python3.10/dist-packages (from codecarbon) (9.0.0)\n", "Collecting fuzzywuzzy (from codecarbon)\n", " Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from codecarbon) (8.1.3)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.0.12)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already 
satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.6.3)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.16)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.5.7)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: python-dateutil>=2.7.0 in /usr/local/lib/python3.10/dist-packages (from arrow->codecarbon) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2022.7.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7.0->arrow->codecarbon) (1.16.0)\n", "Installing collected packages: tokenizers, safetensors, fuzzywuzzy, xxhash, pynvml, dill, multiprocess, huggingface-hub, arrow, transformers, codecarbon, datasets\n", "Successfully installed arrow-1.2.3 codecarbon-2.2.4 datasets-2.13.1 dill-0.3.6 fuzzywuzzy-0.18.0 huggingface-hub-0.15.1 multiprocess-0.70.14 pynvml-11.5.0 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.30.2 xxhash-3.2.0\n" ] } ], "source": [ "%pip install transformers==4.30.2 datasets==2.13.1 codecarbon==2.2.4" ] }, { "cell_type": "markdown", "metadata": { "id": 
"y5XnfvSH7w4z" }, "source": [ "2. Load the data from the hub." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 702, "referenced_widgets": [ "499828942ae14c23a9f41ef0f78f8533", "ec89751e139849c6aaf8b7e05299b000", "42f677beb8c64a7897a0c53388f45584", "daf13fbfa6c340a19e37fb13aba97715", "dda15bc3e167466fb36f3aa0a4145aee", "add506b462f34c5d89f9ed90f262f480", "f4ac5a3c7ecb470396880f609cb38db6", "f28a3ead21884938808e10a28d360850", "b852960402c247448148834d20218026", "69dca2226c1c4a7f8fabca2cbee30e02", "55a1d4de4f6b43a2aed6f71d553d5803", "2c412a960e6d4aaab0f875fcfea66fd2", "d7a07ab3ed6b4a24b4254a83a0a33a71", "d8592503407e4bac8f809022e9deab9c", "453a53fd858c43728a9abdab249b4d2c", "a36ac5f030bc45178acd419ea0a16801", "d91e19d561de437f984a9823106d5004", "d402d074ce134089b32f449b41104420", "23e34d91b4214683b88bc4147db52eb2", "bec1815e28084a958f520c05812dee31", "7a347ed460074cdcad387723de77412a", "3a68bc3657ed4875b88708adeb206955", "40fc99a194664eb799841d81c167baae", "0e93d173427e40eab332d6535971c493", "576ceed5d79b4ee19db37c4b0645c0b0", "8fda4938c19c4655a656792c5ba45b42", "c2f4ce7929944893aa5cdd4393424887", "510466e0ccc6405e8fb24c1a2b664665", "b7fdc64bfb244bb0850217790849982a", "eb8b286a448d4fe29bb34f1e73157091", "397914355d8a48fda027c14ce9ee20b4", "f18c2177eba146e891b92164bffac246", "e3f9d5fe121d4421b21734b491e42bf2", "05cfe1e827cd4467b9b6c58ff261f65f", "7a22f8b6073b41dcad21a486d0e83f33", "4eaec6464e3742a78acabba5fdd21c5a", "b992866dfb3a4cdebf3bc2df602db67b", "70ab003a55ae4a8589a77c7a2d76f7c1", "209035c37b3c4082906817916e87cb14", "6354cd0a18234885b0857ea5b1b4f275", "7cbe75d820e546beb8de7174cb407d7e", "241d69e4595945398660f62adceeb973", "c504177e4759430dbed6b29a5d138d47", "4a8bec7ad010429f98d22b69141523c7", "304ff2d7cc184be4995e8057a852dbf2", "01541824468c4657bb9f42c028b46caf", "8c82bf08d2b54dac9ea93721f6a79551", "be3924e65b51404e83f480bcbda05a3c", "c8cab56ef56746f2ab1c61b6ba53891b", 
"090339befa624ef099b6af328c74bb8b", "a4ec4c0d560f4da5997c2e965f2d4d1e", "e582688a65f14e6fb9f66c7f41cbb4b8", "befd8a115aa3438f92a8b13086c5901f", "7f6c370261e8451c9e82ed1a57c7f3a1", "000bf7ced5054f408728051421ffa4e8", "e1f01a28fbdf4b31b0a69057c919ca30", "da2a09ab872e4d99acc94b8220e5db48", "02863f093a864c6b968b2ade368cf6f5", "fccba7b3b66245aa898ca64b225b41a4", "f39128f43d6f40339b957a863f676fe5", "534b0ff27317493fab835c2e549d1006", "8c17091e4dbd426f9d4eca435a7c0c23", "82a810d53f5d4b33b6cfdb8ab3c1670f", "ebc92b1b87624a24a0d4fcdb75e0b1a7", "a3caedf1b1d04d938098ba17c8c65c08", "c1ed25d1d6f54a0eb1ffebcd89b735ef", "a2acc2cea0894f0992895b6cd6c29af1", "0a6ac7b204234a8e9d16ae22c798780e", "0a276f94f81e4371910223d2f60ba853", "cb957370e9f94f42a6745969b326a0cf", "9ef327ca3e834d3ea1a019a58c7c6a2e", "dd8b9594d93f44efa3f6331aed4e527c", "d0b8ccc83cae4906b3984acf6798bf79", "c104d58d92394210bf70d1fe86f0071b", "144215087c544088ba301a3e78c5294a", "6867d2189fe74b3d8d46dddf8f0ae315", "bdeca98643744a45962d0d35e52ece3c", "a8b614b0611f4482b060fa8fedeaf361", "5a5fa87dfb4f4e6b994fc2a93a57b850", "8fc40c6084d2444d9c41f693b08275a3", "d804c224cfed4d7490da5eb1b800a6f0", "6b93dd6bbe904badac1caa3f94673ec2", "060f032bd8194f199a7ac3cafc650e45", "87026c94b0af4cf3b1b76ecb91bb09b0", "ce765eaeec1f4e1ab38c66144accefdb", "b8bd79e3827e4e53aafb08b3fe944873", "70e390873ce74d0abfdd993dc9d8c8e3", "4005e322e0db4b72bbc5c1043e26d9d5", "18043fe7b91743d6a8638e18cfbeb1b1", "7705d2f335e9484a9a865af700f44edb", "a94b0704e84a40e98e9bc1fd6c0dfb16", "2e1765744d55460aac76d68155ab35cf", "0af7240971214f3598ae275dda58508c", "aee2783b5b48469fb4e928b7bf554086", "7dcbf9861b4e4184a2bb2e8d85f76357", "f319d2a5cd99440a955105789c065b55", "339f16c741d546e68304d0ef1ecbecd8", "e201065c2fda4373ad47af08f8622527", "b8e5d3972b2e4e278ba89eb7d56ebe98", "e9a5a4e11d0a4070860177f3445309fa", "4fa725988f30434287f04c1e20d23eef", "99ba3a06396542c0a493019cfb9d85ef", "6e40de89a4bb4ac9a909cab4e5d04d58", "42305c082c554342a71ed4a58872670b", 
"0a4927f8a5324aba895b30bd09d0246b", "72ce3f3f5a1d452e829d33f9cf56d20d", "f7b951ab31384c929f89016b7fa8fa5b", "ffdbc31818ff4f84b7834085f83b8a97", "0d4925074da342dd9afb090844832d80", "29df5475b688405d95f7aa9692f713af", "aae7653e892e49e9ab3b0b2bc6554ce0", "2e1e8d07b68845258214d1ca832d46a3", "e22bc9a05e564462a8487fb582e00086", "aefabfc2ce2c49f0b8c04786247b0a7a", "3e465184198e418e9b83e4673524b29b", "bf7697082d004d43aef13151cb3d0c24", "f902f616850c4eedb39d43e1c64ab5c7", "4cd4a013f4c84efebcf95f537b79f4c4", "e3b923d39d3d470e83cfd4b7c3537791", "c37cd9052eab451e94eecc53c73f1b87", "ba306741e6434b8690de8026dfde3c39" ] }, "id": "7MbpXGu-v4f1", "outputId": "7097a22c-ca67-4cab-d370-e65479eddf71" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "499828942ae14c23a9f41ef0f78f8533", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/5.12k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2c412a960e6d4aaab0f875fcfea66fd2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "40fc99a194664eb799841d81c167baae", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/1.71M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "05cfe1e827cd4467b9b6c58ff261f65f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 
0%| | 0.00/13.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "304ff2d7cc184be4995e8057a852dbf2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/14.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e1f01a28fbdf4b31b0a69057c919ca30", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/1.84M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a2acc2cea0894f0992895b6cd6c29af1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Extracting data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a8b614b0611f4482b060fa8fedeaf361", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_english split: 0%| | 0/9183 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "18043fe7b91743d6a8638e18cfbeb1b1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_english split: 0%| | 0/48666 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e9a5a4e11d0a4070860177f3445309fa", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_portuguese split: 0%| | 0/48571 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "aae7653e892e49e9ab3b0b2bc6554ce0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_portuguese split: 0%| | 0/9204 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " | prompt | \n", "completion | \n", "
---|---|---|
0 | \n", "What is the best way to find cheap flights to ... | \n", "The best way to find cheap flights to New York... | \n", "
1 | \n", "What is the best way to communicate in a remot... | \n", "The best way to communicate in a remote workpl... | \n", "
2 | \n", "What are the best books to read if I want to l... | \n", "The best books to read if you want to learn ab... | \n", "
3 | \n", "Give me a list of what to do with used brake r... | \n", "Scrap them for their raw metal value. Use the... | \n", "
4 | \n", "How do I get rid of bedbugs. | \n", "Start by thoroughly cleaning your home. Vacuum... | \n", "
... | \n", "... | \n", "... | \n", "
48661 | \n", "How do I get stains out of my carpet. | \n", "The best way to get stains out of your carpet ... | \n", "
48662 | \n", "I would like to make an omelet for my dinner. ... | \n", "Crack two eggs into a bowl.\\n2. Whisk the eggs... | \n", "
48663 | \n", "What are the best restaurants in San Francisco. | \n", "The top rated restaurants in San Francisco inc... | \n", "
48664 | \n", "What is the best way to clean a leather sofa. | \n", "The best way to clean a leather sofa is to use... | \n", "
48665 | \n", "Why do people have children? | \n", "There are many reasons people have children. ... | \n", "
48666 rows × 2 columns
\n", "