{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2023-12-05T04:04:03.809709Z", "iopub.status.busy": "2023-12-05T04:04:03.808818Z", "iopub.status.idle": "2023-12-05T04:04:21.308742Z", "shell.execute_reply": "2023-12-05T04:04:21.308088Z", "shell.execute_reply.started": "2023-12-05T04:04:03.809666Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.9/dist-packages (4.21.3)\n", "Collecting transformers\n", " Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.9/7.9 MB\u001b[0m \u001b[31m52.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: datasets in /usr/local/lib/python3.9/dist-packages (2.4.0)\n", "Collecting datasets\n", " Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m51.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting evaluate\n", " Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting accelerate\n", " Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m265.7/265.7 kB\u001b[0m \u001b[31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.9/dist-packages (from transformers) (5.4.1)\n", "Requirement already satisfied: packaging>=20.0 in 
/usr/local/lib/python3.9/dist-packages (from transformers) (23.0)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.9/dist-packages (from transformers) (4.64.1)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.9/dist-packages (from transformers) (2.28.2)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (1.23.4)\n", "Collecting huggingface-hub<1.0,>=0.16.4\n", " Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.7/311.7 kB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting tokenizers<0.19,>=0.14\n", " Downloading tokenizers-0.15.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m63.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.9/dist-packages (from transformers) (3.9.0)\n", "Collecting safetensors>=0.3.1\n", " Downloading safetensors-0.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m64.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.9/dist-packages (from transformers) (2022.10.31)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.9/dist-packages (from datasets) (3.2.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.9/dist-packages (from datasets) (3.8.3)\n", "Requirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.9/dist-packages (from datasets) (2023.1.0)\n", "Collecting 
pyarrow-hotfix\n", " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.9/dist-packages (from datasets) (1.5.0)\n", "Requirement already satisfied: multiprocess in /usr/local/lib/python3.9/dist-packages (from datasets) (0.70.13)\n", "Requirement already satisfied: dill<0.3.8,>=0.3.0 in /usr/local/lib/python3.9/dist-packages (from datasets) (0.3.5.1)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.9/dist-packages (from datasets) (10.0.1)\n", "Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.9/dist-packages (from evaluate) (0.18.0)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.9/dist-packages (from accelerate) (5.9.4)\n", "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.9/dist-packages (from accelerate) (1.12.1+cu116)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (1.8.2)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (18.2.0)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (1.3.3)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (2.1.1)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.9/dist-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.4.0)\n", "Collecting 
huggingface-hub<1.0,>=0.16.4\n", " Downloading huggingface_hub-0.19.3-py3-none-any.whl (311 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.2/311.2 kB\u001b[0m \u001b[31m45.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading huggingface_hub-0.19.2-py3-none-any.whl (311 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.2/311.2 kB\u001b[0m \u001b[31m49.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.1/311.1 kB\u001b[0m \u001b[31m42.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.2/311.2 kB\u001b[0m \u001b[31m33.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.0/302.0 kB\u001b[0m \u001b[31m44.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hINFO: pip is looking at multiple versions of aiohttp to determine which version is compatible with other requirements. This could take a while.\n", "Collecting aiohttp\n", " Downloading aiohttp-3.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", "\u001b[?25hINFO: pip is looking at multiple versions of fsspec[http] to determine which version is compatible with other requirements. 
This could take a while.\n", "Collecting fsspec[http]<=2023.10.0,>=2023.1.0\n", " Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.4/166.4 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests->transformers) (2.8)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests->transformers) (2019.11.28)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests->transformers) (1.26.14)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.9/dist-packages (from pandas->datasets) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.9/dist-packages (from pandas->datasets) (2022.7.1)\n", "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.14.0)\n", "Installing collected packages: safetensors, pyarrow-hotfix, fsspec, huggingface-hub, tokenizers, accelerate, transformers, datasets, evaluate\n", " Attempting uninstall: fsspec\n", " Found existing installation: fsspec 2023.1.0\n", " Uninstalling fsspec-2023.1.0:\n", " Successfully uninstalled fsspec-2023.1.0\n", " Attempting uninstall: huggingface-hub\n", " Found existing installation: huggingface-hub 0.12.0\n", " Uninstalling huggingface-hub-0.12.0:\n", " Successfully uninstalled huggingface-hub-0.12.0\n", " Attempting uninstall: tokenizers\n", " Found existing installation: tokenizers 0.12.1\n", " Uninstalling tokenizers-0.12.1:\n", " Successfully uninstalled tokenizers-0.12.1\n", " Attempting uninstall: transformers\n", " Found existing installation: transformers 4.21.3\n", " Uninstalling transformers-4.21.3:\n", " Successfully uninstalled transformers-4.21.3\n", " Attempting 
uninstall: datasets\n", " Found existing installation: datasets 2.4.0\n", " Uninstalling datasets-2.4.0:\n", " Successfully uninstalled datasets-2.4.0\n", "Successfully installed accelerate-0.25.0 datasets-2.15.0 evaluate-0.4.1 fsspec-2023.10.0 huggingface-hub-0.19.4 pyarrow-hotfix-0.6 safetensors-0.4.1 tokenizers-0.15.0 transformers-4.35.2\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# Install into the kernel's own environment with the %pip magic, pinned to the\n", "# versions this notebook was last executed with so a fresh run is reproducible.\n", "%pip install transformers==4.35.2 datasets==2.15.0 evaluate==0.4.1 accelerate==0.25.0" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2023-12-05T04:04:21.310409Z", "iopub.status.busy": "2023-12-05T04:04:21.310130Z", "iopub.status.idle": "2023-12-05T04:04:21.783747Z", "shell.execute_reply": "2023-12-05T04:04:21.783079Z", "shell.execute_reply.started": "2023-12-05T04:04:21.310381Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token will not been saved to git credential helper. 
Pass `add_to_git_credential=True` if you want to set the git credential as well.\n", "Token is valid (permission: write).\n", "Your token has been saved to /root/.cache/huggingface/token\n", "Login successful\n" ] } ], "source": [ "import os\n", "\n", "from huggingface_hub import login\n", "\n", "# Security: never hardcode an access token in a notebook. The token is read from\n", "# the HF_TOKEN environment variable; when it is unset, login() receives None.\n", "# NOTE(review): with token=None login() is expected to prompt interactively - confirm.\n", "# NOTE(review): the token that was previously committed in this cell must be revoked.\n", "login(token=os.environ.get(\"HF_TOKEN\"))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2023-12-05T04:04:21.785174Z", "iopub.status.busy": "2023-12-05T04:04:21.784648Z", "iopub.status.idle": "2023-12-05T04:04:24.580049Z", "shell.execute_reply": "2023-12-05T04:04:24.579275Z", "shell.execute_reply.started": "2023-12-05T04:04:21.785148Z" } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4332346192424ecba545a41bfa038983", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/714 [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "912c3c39020b435f9d5fd336a73af546", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cde2a96564724b0cb55e049c7d388c0b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/10.9M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "9867623f3a48416c9cd09ae76b2ac1fc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Extracting data files: 0%| | 0/1 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8c80f1d324f741799078d136d73950db", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating train split: 0%| | 0/42608 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from datasets import load_dataset\n", "\n", "dataset = load_dataset(\"sivan22/orach-chaim\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2023-12-05T04:04:24.582217Z", "iopub.status.busy": "2023-12-05T04:04:24.581855Z", "iopub.status.idle": "2023-12-05T04:04:24.588662Z", "shell.execute_reply": "2023-12-05T04:04:24.587648Z", "shell.execute_reply.started": "2023-12-05T04:04:24.582192Z" } }, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['Unnamed: 0', 'bookname', 'siman', 'sek', 'text', 'seif', 'topic'],\n", " num_rows: 42608\n", " })\n", "})" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2023-12-05T04:04:24.592319Z", "iopub.status.busy": "2023-12-05T04:04:24.592074Z", "iopub.status.idle": "2023-12-05T04:04:24.600368Z", "shell.execute_reply": "2023-12-05T04:04:24.599325Z", "shell.execute_reply.started": "2023-12-05T04:04:24.592293Z" } }, "outputs": [ { "data": { "text/plain": [ "{'Unnamed: 0': 24246,\n", " 'bookname': ' משנה ברורה',\n", " 'siman': 'תלא',\n", " 'sek': 'ט',\n", " 'text': ' ואם התחיל ללמוד: וה\"ה שאר מלאכות: ',\n", " 'seif': ' ב',\n", " 'topic': None}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from random import randint\n", "\n", "# Show one randomly chosen training row as a sanity check of the schema.\n", "# random.randint includes BOTH endpoints, so the upper bound must be len - 1;\n", "# using len itself could produce an index one past the last row.\n", "rnd = randint(0, len(dataset[\"train\"]) - 1)\n", "dataset[\"train\"][rnd]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2023-12-05T04:04:24.605031Z", "iopub.status.busy": "2023-12-05T04:04:24.604696Z", "iopub.status.idle": "2023-12-05T04:04:24.704105Z", "shell.execute_reply": "2023-12-05T04:04:24.701926Z", "shell.execute_reply.started": "2023-12-05T04:04:24.605005Z" } }, "outputs": [], 
"source": [ "# Add an integer \"label\" column filled with zeros to the train split.\n", "# Guarded so the cell can be re-run on the same kernel: the column is only added\n", "# when it is not already present (add_column presumably rejects duplicate names - verify).\n", "if \"label\" not in dataset[\"train\"].column_names:\n", "    dataset[\"train\"] = dataset[\"train\"].add_column(\"label\", [0] * len(dataset[\"train\"]))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2023-12-05T04:04:24.710590Z", "iopub.status.busy": "2023-12-05T04:04:24.710344Z", "iopub.status.idle": "2023-12-05T04:04:28.345669Z", "shell.execute_reply": "2023-12-05T04:04:28.344609Z", "shell.execute_reply.started": "2023-12-05T04:04:24.710566Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", " | Unnamed: 0 | \n", "bookname | \n", "siman | \n", "sek | \n", "text | \n", "seif | \n", "topic | \n", "label | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "עטרת זקנים על אורח חיים | \n", "א | \n", "א | \n", "ומיד כשיעור משנתו ואינו רוצ' לישן יטול ידיו אף... | \n", "א | \n", "None | \n", "0 | \n", "
1 | \n", "1 | \n", "עטרת זקנים על אורח חיים | \n", "א | \n", "ב | \n", "המשכים לעסוק בתור' ילבוש עצמו כראוי להכין לקרא... | \n", "ב | \n", "None | \n", "0 | \n", "
2 | \n", "2 | \n", "עטרת זקנים על אורח חיים | \n", "א | \n", "ג | \n", "ראוי. צריך כ\"א להשתת' עם צער' של רחל לבכות בכל... | \n", "ג | \n", "None | \n", "0 | \n", "
3 | \n", "3 | \n", "עטרת זקנים על אורח חיים | \n", "א | \n", "ד | \n", "טוב לומר סדר הקרבנות דוקא ביום רק משנת איזהו מ... | \n", "ה | \n", "None | \n", "0 | \n", "
4 | \n", "4 | \n", "עטרת זקנים על אורח חיים | \n", "א | \n", "ה | \n", "כשיסיים. אחר חטאת ואשם ודאי לא יאמר משום דהאמי... | \n", "ז | \n", "None | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
42603 | \n", "42603 | \n", "אשל אברהם על שולחן ערוך אורח חיים | \n", "תרצה | \n", "ב | \n", "מלשלם. עיין רא\"ש סוכה דף מ\"ה ות' בי\"ד סי' תל\"ד: | \n", "ב | \n", "None | \n", "0 | \n", "
42604 | \n", "42604 | \n", "אשל אברהם על שולחן ערוך אורח חיים | \n", "תרצו | \n", "א | \n", "שלא לעשות. וכשחל בע\"ש מותר לגלח לכבוד שבת לכ\"... | \n", "א | \n", "None | \n", "0 | \n", "
42605 | \n", "42605 | \n", "אשל אברהם על שולחן ערוך אורח חיים | \n", "תרצו | \n", "ב | \n", "ותענית. ות\"ח שרי אלא דצריך למיתב תענית לתענית... | \n", "ג | \n", "None | \n", "0 | \n", "
42606 | \n", "42606 | \n", "אשל אברהם על שולחן ערוך אורח חיים | \n", "תרצו | \n", "ג | \n", "נוהגים. ע' באר היטב לי\"ד סי' ת\"א וכמ\"ש שם: | \n", "ד | \n", "None | \n", "0 | \n", "
42607 | \n", "42607 | \n", "אשל אברהם על שולחן ערוך אורח חיים | \n", "תרצו | \n", "ד | \n", "במקרא מגילה. ע' ת' בי\"ד סי' תצ\"ו: | \n", "ז | \n", "None | \n", "0 | \n", "
42608 rows × 8 columns
\n", "/notebooks/wandb/run-20231205_040508-3ptlrafb
"
],
"text/plain": [
"Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "Accuracy | \n", "
---|---|---|---|
1 | \n", "5.281300 | \n", "4.988975 | \n", "0.291716 | \n", "
2 | \n", "4.603500 | \n", "4.606649 | \n", "0.330674 | \n", "
"
],
"text/plain": [
"