{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "Q-bj6K7Qv4ft" }, "source": [ "# Fine-Tuning a Generative Pretrained Transformer (`GPT`)\n", "\n", "1. Install required libraries." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SBWCrz5GfBXo", "outputId": "74ee04dc-9602-4117-f90f-29520eb8c277" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting transformers\n", " Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m60.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting datasets\n", " Downloading datasets-2.13.1-py3-none-any.whl (486 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.2/486.2 kB\u001b[0m \u001b[31m44.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting codecarbon\n", " Downloading codecarbon-2.2.4-py3-none-any.whl (176 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.0/176.0 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n", " Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m26.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages 
(from transformers) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n", " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m104.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n", " Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m79.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n", "Collecting dill<0.3.7,>=0.3.0 (from datasets)\n", " Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", "Collecting xxhash (from datasets)\n", " Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m27.5 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting multiprocess (from datasets)\n", " Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.4)\n", "Collecting arrow (from codecarbon)\n", " Downloading arrow-1.2.3-py3-none-any.whl (66 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.4/66.4 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting pynvml (from codecarbon)\n", " Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from codecarbon) (5.9.5)\n", "Requirement already satisfied: py-cpuinfo in /usr/local/lib/python3.10/dist-packages (from codecarbon) (9.0.0)\n", "Collecting fuzzywuzzy (from codecarbon)\n", " Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from codecarbon) (8.1.3)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.0.12)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already 
satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.6.3)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.16)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.5.7)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: python-dateutil>=2.7.0 in /usr/local/lib/python3.10/dist-packages (from arrow->codecarbon) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2022.7.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7.0->arrow->codecarbon) (1.16.0)\n", "Installing collected packages: tokenizers, safetensors, fuzzywuzzy, xxhash, pynvml, dill, multiprocess, huggingface-hub, arrow, transformers, codecarbon, datasets\n", "Successfully installed arrow-1.2.3 codecarbon-2.2.4 datasets-2.13.1 dill-0.3.6 fuzzywuzzy-0.18.0 huggingface-hub-0.15.1 multiprocess-0.70.14 pynvml-11.5.0 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.30.2 xxhash-3.2.0\n" ] } ], "source": [ "%pip install transformers==4.30.2 datasets==2.13.1 codecarbon==2.2.4" ] }, { "cell_type": "markdown", "metadata": { "id": 
"y5XnfvSH7w4z" }, "source": [ "2. Load the data from the hub." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 702, "referenced_widgets": [ "34c17806228640a99cb0aca8cf97fcb8", "20b2e67cec4840de904c7ad288e5fe7d", "a8e41c0c8d944cd5b07f94c45c1be7a5", "9eb9df0ec0b140409c5cd95c41a064df", "e119d4366a6f4fa2bbd4aafc99ffc30a", "dbf04697ee00430197693128526b8e53", "228464ae9ad942b3a898dba94606df85", "32291d2624514176b1f442d893fe1e6d", "441a7d9d5dce47aa8ebd65c34ac888c4", "2ff0730fbf374d21a141c900efb79236", "9d43ff6f9d9c4c359dbcf38aabef576d", "8b814a900b904583bc2443d454d05ac6", "307e38974aa744469ce3f5ba0e8b6f61", "950ed03e99ea48369bdd87289c6339e4", "bfd29522420a44a58574a64830d3e003", "368e0e80fa5c47438cc8012a7c8fa53a", "8fcd3480fb074c51a5f8e83f08aa5c61", "89ef691962474a61a0d8923c8a9e9124", "aefd83b855dd4f93a0d3e53c0b8d13f0", "8d1e1eb40cc74ed7ab182e2b34de3811", "cf79c3560d5a4e85b6a0eb368f1a3132", "5968b3dd30d245eab475d32cc9d54a68", "4947f03b2b88428e92c32b49baa9ed2c", "74573cda9b53432da01043d533aaa5d6", "f2dd88c1446b48f1801029a5191c6aac", "bed42573c6aa445287a0888274050cf8", "095de275ce96496492d047904bb397bc", "c3b50aab3b284944bd3081b7817451ca", "6b0e65b893d14e3890a8b325e1b30564", "63f2c79a3c9b4abea3f069ef2420b4f9", "5a3855fb89f5451d95bf3a4dd4fefc56", "54d48bff94c2492c8eff461dbad6b89a", "6c3493b589504e699522362ac8d81ec5", "401d3f247c564de5b63a207e4649c075", "fd8de013a791476cbf192d278809280a", "055947b2fad240fe953311212cb031ff", "42a8ecb023d94acbb7297adc07603abe", "1621ef33d6a54b5a8f0137d3b0ecff8a", "16fea66468f646c6b8cdb398f4fb100c", "f724eaf1e4fa468398461c2f647c07ee", "e3f22b954ead41069ca2eb40eab2317b", "ad006b98a81c4b6ca9c249c10486ea5b", "5c26f8ecc4974fb9b702ff68703f1f98", "8e342075f81344f9a3ae82a848c875be", "4bd1207833b54df697ca23f3efd6c2cf", "3c19074499e244b493bc05b5892a293e", "2d9f986a8aa04c3dba8312d54c82913c", "103d57b8ae3f4975ac5fd9061b5fec8b", "66d3257e622547c9ac7c62e2dfae5c4d", 
"3ed6fc142bd44e5880206414779a9323", "4dc2c1dd4f924b8ab4e2622378485618", "f769a72348494701a126e3f4ec8623d4", "4aa0ddf83a124c81bb219703ab6865c0", "95df07916da84852b10b03389956badf", "682089d120b54513a200529febfa2527", "d2d45c0d7dda40a0ba66c292e9bbc438", "e1941b55269f4a6dba612c5cb9258da9", "678c862655dc49289716419840ebf2e2", "4f6b121ffecf494cadd881ca8923a884", "3851c1f049384c41848fab5f9c1b523f", "0bce0d61ac4649d9b1e27857be36e6fe", "46ab0e39427a42f9b4ca789c49a3a126", "803dee5d9e6b4dfdb7b9595505e42c03", "fa4e3b1ac243447d9e46d66944547e48", "017e05e3e22b440dab59f06ee83960ff", "c39a3a6e301e423794786f4b15ff8b3d", "efafe6f4325045c5b408b88d5abec153", "48cc2483d0224f8887a76c7f7c8db8e3", "32da12d7423a4c868e7b29c54843c14b", "42f56261aad343828b91f787fc69d5ac", "f650e2ce30ee4d88b4eaa0cfa85a3661", "b8ae79eee2014b6b91542951727a3e37", "2adedc971b184035ae9d96762349f483", "29ae5378211a4e5e9c5c9b046392c76a", "bb22764c1aa3451f97ca079d96b3dbcc", "b52df37a9349480a8621fea91bc9077b", "5172fff7ca6a4c4f94b7292adc728777", "4d48c50082f4472c94b595573723ce15", "5f0ac99d23ce483ba0ac4ab755c443d2", "90427982a8d543bd8b0483aef1511902", "11d5e1cdbdfe4aaab827a49d332b7599", "8e96bffd00de4bde8a918e1496a776d2", "aad80fdde29e49f5a4685db03f57d58e", "32b440414b0a4662b3027c9ece7c74e8", "dfeebf31c9c540979b278684923b8826", "d6c74e7862db40648cf5249a4432b9d0", "e2afe9d5cfd84c9ba5870dd1708a0b16", "f59df2eb0c864ca5b7b25970bc7f2e87", "8c8127b9c8b74f14870776d4fdd9c5e8", "d7c708ea29f540578d11cf76c85b9437", "6cae27568cbf4488895c616f68fcd8e3", "9203b89a29234d5aaa95a0e4a201fdb0", "3e07199280444ab7bf439352acabc01a", "b179b62e34684bc48c6d5c84e13b9798", "295fe87770f74338bd68f2214faa0ae1", "c390fde6a5104351aae53b85bef6222b", "b22f8a6420814ecfadaa7ef5790b476c", "60af0f6acb9f4e5eb3a8eae4ef236eac", "bb1f645cf72c409b95b020b50ac1ae94", "b6e193b4e27242328f6fa4ad54de7409", "73e1a76997ee4be88d94ab0e0f060548", "28ab3538c465487ab240b421b076e7ae", "0f19af896f4a494487fccb7099bf394c", "e79c3a0913b442daa2a37eb823db1a85", 
"204046afafa64b52b9d9e83878a97014", "d8d6f054fb754d848ddf96b7194bf15f", "23db2da29870496ebbb1e04f84185cdf", "1e10171aec83459196bb06d34ceef98e", "406af1ddbe3b46cea390f5cc172b8369", "3470993795f6479f9575cb31891a8506", "22ac260bbf324ae4b9339cbb704f1c18", "2015bd9737594a1dbc9f175314f2c5cc", "b281d3b8a945468db47807e6075cbd63", "13c65f65783340ea95ac944ec27c36f4", "b3e113f634ec44e398af6d9d818a8ea9", "749748bb93b8489a9902c94286f46387", "e92d6fa0a06e477aafff5164b0e2fdd7", "2058678b763a4939bcb3b3671109064e", "1523a44a403b419fa607dc78559d0992", "b6bf64579058468fb48fc212f52f49a4", "ed1bdcb6a6094b1b9e84d93367fa5bfd" ] }, "id": "7MbpXGu-v4f1", "outputId": "88efbd0a-99c7-4672-ec60-ce646f13adb1" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "34c17806228640a99cb0aca8cf97fcb8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/5.12k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8b814a900b904583bc2443d454d05ac6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4947f03b2b88428e92c32b49baa9ed2c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/1.71M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "401d3f247c564de5b63a207e4649c075", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 
0%| | 0.00/13.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4bd1207833b54df697ca23f3efd6c2cf", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/14.5M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d2d45c0d7dda40a0ba66c292e9bbc438", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/1.84M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "efafe6f4325045c5b408b88d5abec153", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Extracting data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4d48c50082f4472c94b595573723ce15", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_english split: 0%| | 0/9183 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8c8127b9c8b74f14870776d4fdd9c5e8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_english split: 0%| | 0/48666 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b6e193b4e27242328f6fa4ad54de7409", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_portuguese split: 0%| | 0/48571 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "22ac260bbf324ae4b9339cbb704f1c18", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_portuguese split: 0%| | 0/9204 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " | prompt | \n", "completion | \n", "
---|---|---|
0 | \n", "What is the best way to set up a home studio f... | \n", "The best way to set up a home studio for recor... | \n", "
1 | \n", "What should I know before I start a 529 plan f... | \n", "Before you start a 529 plan for your children'... | \n", "
2 | \n", "How do I get to the nearest mountain bike trail. | \n", "The best way to find the nearest mountain bike... | \n", "
3 | \n", "Why do people visit the national parks? | \n", "The national parks are a system of diverse nat... | \n", "
4 | \n", "I need to commission a piece of art. Where can... | \n", "If you are looking for an artist to commission... | \n", "
... | \n", "... | \n", "... | \n", "
48661 | \n", "What is the importance of perplexity in langua... | \n", "The concept of perplexity arises as a measure ... | \n", "
48662 | \n", "What are some ways to improve my studying habits. | \n", "Make a study schedule and stick to it.\\n2. Tak... | \n", "
48663 | \n", "What are some vegan recipes I can make quickly. | \n", "Vegan quinoa bowl with grilled vegetables \\n2.... | \n", "
48664 | \n", "Parsing the text of a legal contract. | \n", "I am tasked with parsing the text of a legal c... | \n", "
48665 | \n", "Provide some tips for growing healthy tomatoes... | \n", "Choose a sunny spot in your garden with at lea... | \n", "
48666 rows × 2 columns
\n", "