diff --git "a/notebooks/07r2_tune-lf-py3.11.ipynb" "b/notebooks/07r2_tune-lf-py3.11.ipynb"
deleted file mode 100644
--- "a/notebooks/07r2_tune-lf-py3.11.ipynb"
+++ /dev/null
@@ -1,9938 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
- "inputWidgets": {},
- "nuid": "0ea8b46b-839b-445b-8043-ccdf4e920ace",
- "showTitle": false,
- "title": ""
- }
- },
- "outputs": [],
- "source": [
- "%load_ext autoreload\n",
- "%autoreload 2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
- "inputWidgets": {},
- "nuid": "6d394937-6c99-4a7c-9d32-7600a280032f",
- "showTitle": false,
- "title": ""
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "workding dir: /home/inflaton/code/projects/courses/llm-finetuning\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "import sys\n",
- "from pathlib import Path\n",
- "\n",
- "workding_dir = str(Path.cwd().parent)\n",
- "os.chdir(workding_dir)\n",
- "sys.path.append(workding_dir)\n",
- "print(\"workding dir:\", workding_dir)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
- "inputWidgets": {},
- "nuid": "9f67ec60-2f24-411c-84eb-0dd664b44775",
- "showTitle": false,
- "title": ""
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading env vars from: /home/inflaton/code/projects/courses/llm-finetuning/.env\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from dotenv import find_dotenv, load_dotenv\n",
- "\n",
- "found_dotenv = find_dotenv(\".env\")\n",
- "\n",
- "if len(found_dotenv) == 0:\n",
- " found_dotenv = find_dotenv(\".env.example\")\n",
- "print(f\"loading env vars from: {found_dotenv}\")\n",
- "load_dotenv(found_dotenv, override=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "application/vnd.databricks.v1+cell": {
- "cellMetadata": {},
- "inputWidgets": {},
- "nuid": "f1597656-8042-4878-9d3b-9ebfb8dd86dc",
- "showTitle": false,
- "title": ""
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "('unsloth/Qwen2-0.5B-Instruct',\n",
- " True,\n",
- " None,\n",
- " None,\n",
- " 2048,\n",
- " 6,\n",
- " None,\n",
- " 'datasets/mac/mac.tsv',\n",
- " 'results/mac-results_lf-r2.csv')"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import os\n",
- "\n",
- "model_name = os.getenv(\"MODEL_NAME\")\n",
- "token = os.getenv(\"HF_TOKEN\") or None\n",
- "load_in_4bit = os.getenv(\"LOAD_IN_4BIT\") == \"true\"\n",
- "local_model = os.getenv(\"LOCAL_MODEL\")\n",
- "hub_model = os.getenv(\"HUB_MODEL\")\n",
- "num_train_epochs = int(os.getenv(\"NUM_TRAIN_EPOCHS\") or 0)\n",
- "data_path = os.getenv(\"DATA_PATH\")\n",
- "results_path = os.getenv(\"RESULTS_PATH\")\n",
- "\n",
- "max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n",
- "dtype = (\n",
- " None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
- ")\n",
- "\n",
- "model_name, load_in_4bit, local_model, hub_model, max_seq_length, num_train_epochs, dtype, data_path, results_path"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Thu Jul 4 11:06:16 2024 \n",
- "+---------------------------------------------------------------------------------------+\n",
- "| NVIDIA-SMI 545.23.07 Driver Version: 546.12 CUDA Version: 12.3 |\n",
- "|-----------------------------------------+----------------------+----------------------+\n",
- "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
- "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
- "| | | MIG M. |\n",
- "|=========================================+======================+======================|\n",
- "| 0 NVIDIA GeForce RTX 4080 ... On | 00000000:01:00.0 Off | N/A |\n",
- "| N/A 52C P8 3W / 150W | 355MiB / 12282MiB | 0% Default |\n",
- "| | | N/A |\n",
- "+-----------------------------------------+----------------------+----------------------+\n",
- " \n",
- "+---------------------------------------------------------------------------------------+\n",
- "| Processes: |\n",
- "| GPU GI CI PID Type Process name GPU Memory |\n",
- "| ID ID Usage |\n",
- "|=======================================================================================|\n",
- "| No running processes found |\n",
- "+---------------------------------------------------------------------------------------+\n"
- ]
- }
- ],
- "source": [
- "!nvidia-smi"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "๐ฆฅ Unsloth: Will patch your computer to enable 2x faster free finetuning.\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "[nltk_data] Downloading package wordnet to /home/inflaton/nltk_data...\n",
- "[nltk_data] Package wordnet is already up-to-date!\n",
- "[nltk_data] Downloading package punkt to /home/inflaton/nltk_data...\n",
- "[nltk_data] Package punkt is already up-to-date!\n",
- "[nltk_data] Downloading package omw-1.4 to /home/inflaton/nltk_data...\n",
- "[nltk_data] Package omw-1.4 is already up-to-date!\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading /home/inflaton/code/projects/courses/llm-finetuning/llm_toolkit/translation_engine.py\n",
- "loading train/test data files\n",
- "DatasetDict({\n",
- " train: Dataset({\n",
- " features: ['chinese', 'english'],\n",
- " num_rows: 4528\n",
- " })\n",
- " test: Dataset({\n",
- " features: ['chinese', 'english'],\n",
- " num_rows: 1133\n",
- " })\n",
- "})\n"
- ]
- }
- ],
- "source": [
- "from llm_toolkit.translation_engine import load_translation_dataset\n",
- "\n",
- "dataset = load_translation_dataset(data_path)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = dataset[\"train\"].to_pandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "\n",
- "df_alpaca = pd.DataFrame({\"instruction\": [\"Please translate the following Chinese text into English and provide only the translated content, nothing else.\"]*len(df)})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " instruction | \n",
- " input | \n",
- " output | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Please translate the following Chinese text in... | \n",
- " ๅ
จไป็็ไปๆญๆใ | \n",
- " Because I was protected by a fox fairy. | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Please translate the following Chinese text in... | \n",
- " ่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ | \n",
- " He was the director, the cousin later told the... | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Please translate the following Chinese text in... | \n",
- " ่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ | \n",
- " Xi-feng suddenly seemed to remember something,... | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Please translate the following Chinese text in... | \n",
- " ไธไธช่็บขๅซๅ
ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... | \n",
- " The three old Red Guards stood in front of Ye ... | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Please translate the following Chinese text in... | \n",
- " ็จๅ
็็
งๅๅ
จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ | \n",
- " Mr. Cheng accepted their toast with equanimity... | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 4523 | \n",
- " Please translate the following Chinese text in... | \n",
- " ๅค่พนๆไธคๅผ ่
ฟๆญช้ข่ฃ็ๅ
ซไปๆกๅญ๏ผๆกๆ่กไนฑๆก็ๅ ๆก็ญ็ช็ๆจๅณใ | \n",
- " Two rickety tables with scarred tops and a few... | \n",
- "
\n",
- " \n",
- " 4524 | \n",
- " Please translate the following Chinese text in... | \n",
- " ่ดพ็ๅฌไบ๏ผๅ็ๆ่ณๆ ่
ฎใ | \n",
- " At this last remark Jia Rui positively scratch... | \n",
- "
\n",
- " \n",
- " 4525 | \n",
- " Please translate the following Chinese text in... | \n",
- " ๅฌไบ่ฟๆ ท็่ฏไปท๏ผๆไปฌๅฟๆ
ๆฟๅจ๏ผๅๅคงๅฎถไธ่ตทๆฏ่้ซๅผ๏ผๆๅ็ไบ๏ผ | \n",
- " Hearing comments like this, our emotions were ... | \n",
- "
\n",
- " \n",
- " 4526 | \n",
- " Please translate the following Chinese text in... | \n",
- " ๆตท่ๅ
ฌ้๏ผโ่ฎฐไฝไบๅ๏ผโ | \n",
- " 'Can you remember that?' | \n",
- "
\n",
- " \n",
- " 4527 | \n",
- " Please translate the following Chinese text in... | \n",
- " ไธ้ข่ฏด๏ผ่ฟๆ ทๅ็ผบๅฐ็ป่ใ | \n",
- " This time the opinions from above said it need... | \n",
- "
\n",
- " \n",
- "
\n",
- "
4528 rows ร 3 columns
\n",
- "
"
- ],
- "text/plain": [
- " instruction \\\n",
- "0 Please translate the following Chinese text in... \n",
- "1 Please translate the following Chinese text in... \n",
- "2 Please translate the following Chinese text in... \n",
- "3 Please translate the following Chinese text in... \n",
- "4 Please translate the following Chinese text in... \n",
- "... ... \n",
- "4523 Please translate the following Chinese text in... \n",
- "4524 Please translate the following Chinese text in... \n",
- "4525 Please translate the following Chinese text in... \n",
- "4526 Please translate the following Chinese text in... \n",
- "4527 Please translate the following Chinese text in... \n",
- "\n",
- " input \\\n",
- "0 ๅ
จไป็็ไปๆญๆใ \n",
- "1 ่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ \n",
- "2 ่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ \n",
- "3 ไธไธช่็บขๅซๅ
ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... \n",
- "4 ็จๅ
็็
งๅๅ
จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ \n",
- "... ... \n",
- "4523 ๅค่พนๆไธคๅผ ่
ฟๆญช้ข่ฃ็ๅ
ซไปๆกๅญ๏ผๆกๆ่กไนฑๆก็ๅ ๆก็ญ็ช็ๆจๅณใ \n",
- "4524 ่ดพ็ๅฌไบ๏ผๅ็ๆ่ณๆ ่
ฎใ \n",
- "4525 ๅฌไบ่ฟๆ ท็่ฏไปท๏ผๆไปฌๅฟๆ
ๆฟๅจ๏ผๅๅคงๅฎถไธ่ตทๆฏ่้ซๅผ๏ผๆๅ็ไบ๏ผ \n",
- "4526 ๆตท่ๅ
ฌ้๏ผโ่ฎฐไฝไบๅ๏ผโ \n",
- "4527 ไธ้ข่ฏด๏ผ่ฟๆ ทๅ็ผบๅฐ็ป่ใ \n",
- "\n",
- " output \n",
- "0 Because I was protected by a fox fairy. \n",
- "1 He was the director, the cousin later told the... \n",
- "2 Xi-feng suddenly seemed to remember something,... \n",
- "3 The three old Red Guards stood in front of Ye ... \n",
- "4 Mr. Cheng accepted their toast with equanimity... \n",
- "... ... \n",
- "4523 Two rickety tables with scarred tops and a few... \n",
- "4524 At this last remark Jia Rui positively scratch... \n",
- "4525 Hearing comments like this, our emotions were ... \n",
- "4526 'Can you remember that?' \n",
- "4527 This time the opinions from above said it need... \n",
- "\n",
- "[4528 rows x 3 columns]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df_alpaca[\"input\"] = df[\"chinese\"]\n",
- "df_alpaca[\"output\"] = df[\"english\"]\n",
- "df_alpaca"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_alpaca.to_json(\n",
- " \"llama-factory/data/alpaca_mac.json\", orient=\"records\", lines=False, indent=2\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.read_json(\"llama-factory/data/alpaca_mac.json\", orient=\"records\", lines=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " instruction | \n",
- " input | \n",
- " output | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Please translate the following Chinese text in... | \n",
- " ๅ
จไป็็ไปๆญๆใ | \n",
- " Because I was protected by a fox fairy. | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Please translate the following Chinese text in... | \n",
- " ่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ | \n",
- " He was the director, the cousin later told the... | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Please translate the following Chinese text in... | \n",
- " ่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ | \n",
- " Xi-feng suddenly seemed to remember something,... | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " Please translate the following Chinese text in... | \n",
- " ไธไธช่็บขๅซๅ
ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... | \n",
- " The three old Red Guards stood in front of Ye ... | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " Please translate the following Chinese text in... | \n",
- " ็จๅ
็็
งๅๅ
จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ | \n",
- " Mr. Cheng accepted their toast with equanimity... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " instruction \\\n",
- "0 Please translate the following Chinese text in... \n",
- "1 Please translate the following Chinese text in... \n",
- "2 Please translate the following Chinese text in... \n",
- "3 Please translate the following Chinese text in... \n",
- "4 Please translate the following Chinese text in... \n",
- "\n",
- " input \\\n",
- "0 ๅ
จไป็็ไปๆญๆใ \n",
- "1 ่ฟๅ๏ผ่กจๅฅๅ่ฏๅฅนไฟฉ๏ผ่ฟไบบๆฏๅฏผๆผ๏ผๅจๅคๅฝ็่ฟๅญฆ็๏ผ่ฟไผ็ผๅง๏ผไปๅคฉๆ็่ฟๆ๏ผๅฐฑๆฏไป่ช็ผ่ชๅฏผ็ใ \n",
- "2 ่ฟๅคๅงๅฟฝ็ถๆณ่ตทไธไปถไบๆฅ๏ผไพฟๅ็ชๅคๅซ๏ผโ่ๅฟๅๆฅ๏ผโ \n",
- "3 ไธไธช่็บขๅซๅ
ต่ตฐๅฐๅถๆๆด้ขๅ๏ผ้ขๅฏน็ๅฅน็ซๆไบไธๆโโๅฝๅนด๏ผๅฅนไปฌไนๆฏ่ฟๆ ท้ขๅฏนๅถๅฒๆณฐ็โโ่ฏๅพๅ็ฐ... \n",
- "4 ็จๅ
็็
งๅๅ
จๆถ๏ผ้ฝๆฏไธไธชโ่ฐขโๅญ๏ผ็ถๅ้ฎ็็ฆ็ถๆไปไน่ฏ่ฏดใ \n",
- "\n",
- " output \n",
- "0 Because I was protected by a fox fairy. \n",
- "1 He was the director, the cousin later told the... \n",
- "2 Xi-feng suddenly seemed to remember something,... \n",
- "3 The three old Red Guards stood in front of Ye ... \n",
- "4 Mr. Cheng accepted their toast with equanimity... "
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Python 3.11.9\n",
- "\u001b[33mWARNING: Package(s) not found: flash-attn\u001b[0m\u001b[33m\n",
- "\u001b[0mCPU times: user 5.39 ms, sys: 19.5 ms, total: 24.9 ms\n",
- "Wall time: 527 ms\n"
- ]
- }
- ],
- "source": [
- "%%time\n",
- "!python --version\n",
- "!pip show flash-attn"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Current Directory:\n",
- "/home/inflaton/code/projects/courses/llm-finetuning/llama-factory\n",
- "07/04/2024 11:09:05 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 11:09:06,545 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/vocab.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 11:09:06,545 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/merges.txt\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 11:09:06,545 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/tokenizer.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 11:09:06,545 >> loading file added_tokens.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 11:09:06,545 >> loading file special_tokens_map.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 11:09:06,545 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/tokenizer_config.json\n",
- "[WARNING|logging.py:313] 2024-07-04 11:09:06,662 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
- "07/04/2024 11:09:06 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n",
- "07/04/2024 11:09:06 - INFO - llamafactory.data.template - Add <|im_start|> to stop words.\n",
- "07/04/2024 11:09:06 - INFO - llamafactory.data.loader - Loading dataset alpaca_mac.json...\n",
- "Converting format of dataset (num_proc=16): 100%|โ| 4528/4528 [00:00<00:00, 1685\n",
- "Running tokenizer on dataset (num_proc=16): 100%|โ| 4528/4528 [00:01<00:00, 3476\n",
- "input_ids:\n",
- "[151644, 872, 198, 5501, 14683, 279, 2701, 8453, 1467, 1119, 6364, 323, 3410, 1172, 279, 24531, 2213, 11, 4302, 770, 624, 35987, 102895, 99164, 100324, 100717, 100095, 99509, 1773, 151645, 198, 151644, 77091, 198, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "inputs:\n",
- "<|im_start|>user\n",
- "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n",
- "ๅ
จไป็็ไปๆญๆใ<|im_end|>\n",
- "<|im_start|>assistant\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "label_ids:\n",
- "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "labels:\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 11:09:09,749 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 11:09:09,750 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|modeling_utils.py:3556] 2024-07-04 11:09:09,841 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/model.safetensors\n",
- "[INFO|modeling_utils.py:1531] 2024-07-04 11:09:13,066 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n",
- "[INFO|configuration_utils.py:1000] 2024-07-04 11:09:13,069 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645\n",
- "}\n",
- "\n",
- "[INFO|modeling_utils.py:4364] 2024-07-04 11:10:03,269 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n",
- "\n",
- "[INFO|modeling_utils.py:4372] 2024-07-04 11:10:03,270 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at Qwen/Qwen2-0.5B-Instruct.\n",
- "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n",
- "[INFO|configuration_utils.py:955] 2024-07-04 11:10:03,578 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/generation_config.json\n",
- "[INFO|configuration_utils.py:1000] 2024-07-04 11:10:03,578 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"do_sample\": true,\n",
- " \"eos_token_id\": [\n",
- " 151645,\n",
- " 151643\n",
- " ],\n",
- " \"pad_token_id\": 151643,\n",
- " \"repetition_penalty\": 1.1,\n",
- " \"temperature\": 0.7,\n",
- " \"top_k\": 20,\n",
- " \"top_p\": 0.8\n",
- "}\n",
- "\n",
- "07/04/2024 11:10:03 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n",
- "07/04/2024 11:10:03 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.\n",
- "07/04/2024 11:10:03 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n",
- "07/04/2024 11:10:03 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n",
- "07/04/2024 11:10:03 - INFO - llamafactory.model.model_utils.misc - Found linear modules: up_proj,down_proj,k_proj,q_proj,v_proj,o_proj,gate_proj\n",
- "07/04/2024 11:10:04 - INFO - llamafactory.model.loader - trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826\n",
- "[INFO|trainer.py:642] 2024-07-04 11:10:04,049 >> Using auto half precision backend\n",
- "07/04/2024 11:10:04 - WARNING - llamafactory.train.callbacks - Previous trainer log in this folder will be deleted.\n",
- "[INFO|trainer.py:2128] 2024-07-04 11:10:04,194 >> ***** Running training *****\n",
- "[INFO|trainer.py:2129] 2024-07-04 11:10:04,194 >> Num examples = 4,482\n",
- "[INFO|trainer.py:2130] 2024-07-04 11:10:04,194 >> Num Epochs = 6\n",
- "[INFO|trainer.py:2131] 2024-07-04 11:10:04,194 >> Instantaneous batch size per device = 1\n",
- "[INFO|trainer.py:2134] 2024-07-04 11:10:04,194 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n",
- "[INFO|trainer.py:2135] 2024-07-04 11:10:04,194 >> Gradient Accumulation steps = 8\n",
- "[INFO|trainer.py:2136] 2024-07-04 11:10:04,195 >> Total optimization steps = 3,360\n",
- "[INFO|trainer.py:2137] 2024-07-04 11:10:04,196 >> Number of trainable parameters = 4,399,104\n",
- "[INFO|integration_utils.py:750] 2024-07-04 11:10:04,198 >> Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33minflaton-sg\u001b[0m (\u001b[33minflaton-ai\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.17.4\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m/home/inflaton/code/projects/courses/llm-finetuning/llama-factory/wandb/run-20240704_111005-u8sqhi0x\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mqwen2_0.5b_lora_sft\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/u8sqhi0x\u001b[0m\n",
- "{'loss': 2.581, 'grad_norm': 2.9743993282318115, 'learning_rate': 2.9761904761904763e-06, 'epoch': 0.02}\n",
- "{'loss': 2.704, 'grad_norm': 3.803558826446533, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.04}\n",
- "{'loss': 2.5764, 'grad_norm': 2.419433116912842, 'learning_rate': 8.92857142857143e-06, 'epoch': 0.05}\n",
- "{'loss': 2.4994, 'grad_norm': 4.8528876304626465, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.07}\n",
- "{'loss': 2.6881, 'grad_norm': 2.5375239849090576, 'learning_rate': 1.4880952380952381e-05, 'epoch': 0.09}\n",
- "{'loss': 2.3869, 'grad_norm': 2.810744524002075, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.11}\n",
- "{'loss': 2.5728, 'grad_norm': 2.6387815475463867, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.12}\n",
- "{'loss': 2.3077, 'grad_norm': 2.4742910861968994, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.14}\n",
- "{'loss': 2.4318, 'grad_norm': 3.0079479217529297, 'learning_rate': 2.6785714285714288e-05, 'epoch': 0.16}\n",
- "{'loss': 2.29, 'grad_norm': 2.584622859954834, 'learning_rate': 2.9761904761904762e-05, 'epoch': 0.18}\n",
- "{'loss': 2.3407, 'grad_norm': 3.3264784812927246, 'learning_rate': 3.273809523809524e-05, 'epoch': 0.2}\n",
- "{'loss': 2.3577, 'grad_norm': 2.667269468307495, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.21}\n",
- "{'loss': 2.2612, 'grad_norm': 2.8811182975769043, 'learning_rate': 3.8690476190476195e-05, 'epoch': 0.23}\n",
- "{'loss': 2.3096, 'grad_norm': 3.249279499053955, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.25}\n",
- "{'loss': 2.183, 'grad_norm': 2.5008630752563477, 'learning_rate': 4.464285714285715e-05, 'epoch': 0.27}\n",
- "{'loss': 2.23, 'grad_norm': 2.457791328430176, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.29}\n",
- "{'loss': 2.3025, 'grad_norm': 2.4453022480010986, 'learning_rate': 5.05952380952381e-05, 'epoch': 0.3}\n",
- "{'loss': 2.0884, 'grad_norm': 2.7773451805114746, 'learning_rate': 5.3571428571428575e-05, 'epoch': 0.32}\n",
- "{'loss': 2.2048, 'grad_norm': 3.600346565246582, 'learning_rate': 5.6547619047619046e-05, 'epoch': 0.34}\n",
- "{'loss': 2.3676, 'grad_norm': 2.939140796661377, 'learning_rate': 5.9523809523809524e-05, 'epoch': 0.36}\n",
- "{'loss': 2.2684, 'grad_norm': 2.7832212448120117, 'learning_rate': 6.25e-05, 'epoch': 0.37}\n",
- "{'loss': 2.2021, 'grad_norm': 3.7691140174865723, 'learning_rate': 6.547619047619048e-05, 'epoch': 0.39}\n",
- "{'loss': 2.1625, 'grad_norm': 3.3338756561279297, 'learning_rate': 6.845238095238096e-05, 'epoch': 0.41}\n",
- "{'loss': 2.3564, 'grad_norm': 4.061848163604736, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.43}\n",
- "{'loss': 2.2266, 'grad_norm': 3.3382863998413086, 'learning_rate': 7.440476190476191e-05, 'epoch': 0.45}\n",
- "{'loss': 2.1837, 'grad_norm': 3.208007335662842, 'learning_rate': 7.738095238095239e-05, 'epoch': 0.46}\n",
- "{'loss': 2.1765, 'grad_norm': 4.045449733734131, 'learning_rate': 8.035714285714287e-05, 'epoch': 0.48}\n",
- "{'loss': 2.2863, 'grad_norm': 4.37124490737915, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.5}\n",
- "{'loss': 2.0807, 'grad_norm': 2.6629326343536377, 'learning_rate': 8.630952380952382e-05, 'epoch': 0.52}\n",
- "{'loss': 2.2086, 'grad_norm': 3.6005942821502686, 'learning_rate': 8.92857142857143e-05, 'epoch': 0.54}\n",
- "{'loss': 2.2231, 'grad_norm': 4.065690040588379, 'learning_rate': 9.226190476190478e-05, 'epoch': 0.55}\n",
- "{'loss': 1.9875, 'grad_norm': 6.6260294914245605, 'learning_rate': 9.523809523809524e-05, 'epoch': 0.57}\n",
- "{'loss': 2.0721, 'grad_norm': 4.6804656982421875, 'learning_rate': 9.821428571428572e-05, 'epoch': 0.59}\n",
- "{'loss': 2.1194, 'grad_norm': 4.226340293884277, 'learning_rate': 9.999956828659095e-05, 'epoch': 0.61}\n",
- "{'loss': 2.1256, 'grad_norm': 4.530922889709473, 'learning_rate': 9.999471159635539e-05, 'epoch': 0.62}\n",
- "{'loss': 2.0243, 'grad_norm': 3.235328197479248, 'learning_rate': 9.998445910004082e-05, 'epoch': 0.64}\n",
- "{'loss': 2.2819, 'grad_norm': 4.247537136077881, 'learning_rate': 9.996881190417393e-05, 'epoch': 0.66}\n",
- "{'loss': 2.1964, 'grad_norm': 3.339164972305298, 'learning_rate': 9.994777169751806e-05, 'epoch': 0.68}\n",
- "{'loss': 1.9102, 'grad_norm': 2.744009494781494, 'learning_rate': 9.992134075089084e-05, 'epoch': 0.7}\n",
- "{'loss': 2.0751, 'grad_norm': 3.513111114501953, 'learning_rate': 9.988952191691925e-05, 'epoch': 0.71}\n",
- "{'loss': 2.1697, 'grad_norm': 3.301513433456421, 'learning_rate': 9.985231862973168e-05, 'epoch': 0.73}\n",
- "{'loss': 2.1742, 'grad_norm': 2.8456363677978516, 'learning_rate': 9.980973490458728e-05, 'epoch': 0.75}\n",
- "{'loss': 2.1497, 'grad_norm': 3.499181032180786, 'learning_rate': 9.976177533744261e-05, 'epoch': 0.77}\n",
- "{'loss': 2.0643, 'grad_norm': 4.2905964851379395, 'learning_rate': 9.97084451044556e-05, 'epoch': 0.79}\n",
- "{'loss': 1.9934, 'grad_norm': 2.706711769104004, 'learning_rate': 9.964974996142698e-05, 'epoch': 0.8}\n",
- "{'loss': 2.0795, 'grad_norm': 3.038059949874878, 'learning_rate': 9.958569624317893e-05, 'epoch': 0.82}\n",
- "{'loss': 2.0908, 'grad_norm': 4.291042804718018, 'learning_rate': 9.951629086287151e-05, 'epoch': 0.84}\n",
- "{'loss': 2.105, 'grad_norm': 3.027702808380127, 'learning_rate': 9.944154131125642e-05, 'epoch': 0.86}\n",
- "{'loss': 2.112, 'grad_norm': 2.6875832080841064, 'learning_rate': 9.936145565586871e-05, 'epoch': 0.87}\n",
- "{'loss': 2.2824, 'grad_norm': 2.8110086917877197, 'learning_rate': 9.927604254015585e-05, 'epoch': 0.89}\n",
- "{'loss': 2.2181, 'grad_norm': 3.3072471618652344, 'learning_rate': 9.918531118254507e-05, 'epoch': 0.91}\n",
- "{'loss': 2.1132, 'grad_norm': 3.8883237838745117, 'learning_rate': 9.90892713754483e-05, 'epoch': 0.93}\n",
- "{'loss': 2.1513, 'grad_norm': 3.775455951690674, 'learning_rate': 9.898793348420536e-05, 'epoch': 0.95}\n",
- "{'loss': 2.1119, 'grad_norm': 3.0280404090881348, 'learning_rate': 9.888130844596524e-05, 'epoch': 0.96}\n",
- "{'loss': 2.1126, 'grad_norm': 3.2323291301727295, 'learning_rate': 9.876940776850569e-05, 'epoch': 0.98}\n",
- "{'loss': 2.1328, 'grad_norm': 2.91339373588562, 'learning_rate': 9.865224352899119e-05, 'epoch': 1.0}\n",
- " 17%|โโโโโโโ | 560/3360 [12:30<1:02:53, 1.35s/it][INFO|trainer.py:3788] 2024-07-04 11:22:39,524 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 11:22:39,524 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 11:22:39,524 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:01, 33.16it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:01, 29.60it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:00<00:01, 27.75it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 27.78it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 27.00it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:01, 25.65it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:00<00:00, 23.31it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:00, 22.47it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 22.29it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 22.68it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 23.12it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 24.24it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:01<00:00, 24.55it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.056835651397705, 'eval_runtime': 1.9007, 'eval_samples_per_second': 24.201, 'eval_steps_per_second': 24.201, 'epoch': 1.0}\n",
- " 17%|โโโโโโโ | 560/3360 [12:31<1:02:53, 1.35s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 24.32it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 11:22:41,427 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-560\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 11:22:42,026 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 11:22:42,027 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 11:22:42,060 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-560/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 11:22:42,060 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-560/special_tokens_map.json\n",
- "{'loss': 1.996, 'grad_norm': 2.9073429107666016, 'learning_rate': 9.852982837266955e-05, 'epoch': 1.02}\n",
- "{'loss': 1.7941, 'grad_norm': 3.4045894145965576, 'learning_rate': 9.840217551150706e-05, 'epoch': 1.04}\n",
- "{'loss': 1.9779, 'grad_norm': 2.8464860916137695, 'learning_rate': 9.826929872276255e-05, 'epoch': 1.05}\n",
- "{'loss': 1.92, 'grad_norm': 3.770984411239624, 'learning_rate': 9.81312123475006e-05, 'epoch': 1.07}\n",
- "{'loss': 1.8683, 'grad_norm': 3.4236226081848145, 'learning_rate': 9.798793128904356e-05, 'epoch': 1.09}\n",
- "{'loss': 1.9201, 'grad_norm': 4.08709716796875, 'learning_rate': 9.78394710113631e-05, 'epoch': 1.11}\n",
- "{'loss': 1.8563, 'grad_norm': 3.362687349319458, 'learning_rate': 9.768584753741134e-05, 'epoch': 1.12}\n",
- "{'loss': 1.913, 'grad_norm': 5.210264682769775, 'learning_rate': 9.752707744739145e-05, 'epoch': 1.14}\n",
- "{'loss': 1.9273, 'grad_norm': 3.515490770339966, 'learning_rate': 9.736317787696816e-05, 'epoch': 1.16}\n",
- "{'loss': 1.8016, 'grad_norm': 3.4942610263824463, 'learning_rate': 9.719416651541839e-05, 'epoch': 1.18}\n",
- "{'loss': 1.7993, 'grad_norm': 2.7268266677856445, 'learning_rate': 9.702006160372209e-05, 'epoch': 1.2}\n",
- "{'loss': 1.9155, 'grad_norm': 3.6193785667419434, 'learning_rate': 9.684088193259355e-05, 'epoch': 1.21}\n",
- "{'loss': 1.8261, 'grad_norm': 4.29509973526001, 'learning_rate': 9.665664684045333e-05, 'epoch': 1.23}\n",
- "{'loss': 1.9301, 'grad_norm': 4.692563056945801, 'learning_rate': 9.646737621134112e-05, 'epoch': 1.25}\n",
- "{'loss': 1.8418, 'grad_norm': 4.545106410980225, 'learning_rate': 9.627309047276974e-05, 'epoch': 1.27}\n",
- "{'loss': 2.0611, 'grad_norm': 4.3200860023498535, 'learning_rate': 9.607381059352038e-05, 'epoch': 1.29}\n",
- "{'loss': 1.9531, 'grad_norm': 3.2151238918304443, 'learning_rate': 9.586955808137958e-05, 'epoch': 1.3}\n",
- "{'loss': 1.9447, 'grad_norm': 3.385021209716797, 'learning_rate': 9.566035498081784e-05, 'epoch': 1.32}\n",
- "{'loss': 1.9424, 'grad_norm': 8.94682502746582, 'learning_rate': 9.544622387061055e-05, 'epoch': 1.34}\n",
- "{'loss': 1.706, 'grad_norm': 4.064428806304932, 'learning_rate': 9.522718786140097e-05, 'epoch': 1.36}\n",
- "{'loss': 1.9165, 'grad_norm': 4.604166507720947, 'learning_rate': 9.500327059320606e-05, 'epoch': 1.37}\n",
- "{'loss': 1.7816, 'grad_norm': 5.32956600189209, 'learning_rate': 9.477449623286505e-05, 'epoch': 1.39}\n",
- "{'loss': 1.6637, 'grad_norm': 3.613009214401245, 'learning_rate': 9.454088947143116e-05, 'epoch': 1.41}\n",
- "{'loss': 1.9416, 'grad_norm': 4.8296799659729, 'learning_rate': 9.430247552150673e-05, 'epoch': 1.43}\n",
- "{'loss': 1.8371, 'grad_norm': 4.565757751464844, 'learning_rate': 9.405928011452211e-05, 'epoch': 1.45}\n",
- "{'loss': 1.846, 'grad_norm': 3.5512914657592773, 'learning_rate': 9.381132949795861e-05, 'epoch': 1.46}\n",
- "{'loss': 2.0069, 'grad_norm': 3.9040660858154297, 'learning_rate': 9.35586504325155e-05, 'epoch': 1.48}\n",
- "{'loss': 1.8083, 'grad_norm': 3.609498977661133, 'learning_rate': 9.330127018922194e-05, 'epoch': 1.5}\n",
- "{'loss': 1.7487, 'grad_norm': 3.3245325088500977, 'learning_rate': 9.303921654649362e-05, 'epoch': 1.52}\n",
- "{'loss': 1.8764, 'grad_norm': 4.417221546173096, 'learning_rate': 9.277251778713474e-05, 'epoch': 1.54}\n",
- "{'loss': 1.8843, 'grad_norm': 4.959105014801025, 'learning_rate': 9.250120269528546e-05, 'epoch': 1.55}\n",
- "{'loss': 1.793, 'grad_norm': 3.7974698543548584, 'learning_rate': 9.22253005533154e-05, 'epoch': 1.57}\n",
- "{'loss': 1.9039, 'grad_norm': 3.882502555847168, 'learning_rate': 9.194484113866313e-05, 'epoch': 1.59}\n",
- "{'loss': 1.9854, 'grad_norm': 3.416905164718628, 'learning_rate': 9.165985472062246e-05, 'epoch': 1.61}\n",
- "{'loss': 1.7529, 'grad_norm': 3.456245183944702, 'learning_rate': 9.137037205707552e-05, 'epoch': 1.62}\n",
- "{'loss': 1.8017, 'grad_norm': 3.490054130554199, 'learning_rate': 9.107642439117321e-05, 'epoch': 1.64}\n",
- "{'loss': 1.8225, 'grad_norm': 3.2115142345428467, 'learning_rate': 9.077804344796302e-05, 'epoch': 1.66}\n",
- "{'loss': 1.8333, 'grad_norm': 3.5726113319396973, 'learning_rate': 9.04752614309652e-05, 'epoch': 1.68}\n",
- "{'loss': 1.7861, 'grad_norm': 3.9323503971099854, 'learning_rate': 9.01681110186971e-05, 'epoch': 1.7}\n",
- "{'loss': 1.8067, 'grad_norm': 4.4842352867126465, 'learning_rate': 8.985662536114613e-05, 'epoch': 1.71}\n",
- "{'loss': 1.8397, 'grad_norm': 3.1608762741088867, 'learning_rate': 8.954083807619208e-05, 'epoch': 1.73}\n",
- "{'loss': 1.9411, 'grad_norm': 3.920475959777832, 'learning_rate': 8.922078324597879e-05, 'epoch': 1.75}\n",
- "{'loss': 1.8974, 'grad_norm': 3.438220739364624, 'learning_rate': 8.889649541323574e-05, 'epoch': 1.77}\n",
- "{'loss': 1.8202, 'grad_norm': 4.780834674835205, 'learning_rate': 8.856800957755e-05, 'epoch': 1.78}\n",
- "{'loss': 1.8528, 'grad_norm': 3.768432378768921, 'learning_rate': 8.823536119158864e-05, 'epoch': 1.8}\n",
- "{'loss': 1.753, 'grad_norm': 4.07826042175293, 'learning_rate': 8.789858615727265e-05, 'epoch': 1.82}\n",
- "{'loss': 1.7389, 'grad_norm': 3.5676631927490234, 'learning_rate': 8.755772082190194e-05, 'epoch': 1.84}\n",
- "{'loss': 1.9198, 'grad_norm': 3.463003635406494, 'learning_rate': 8.721280197423258e-05, 'epoch': 1.86}\n",
- "{'loss': 1.7722, 'grad_norm': 4.634316921234131, 'learning_rate': 8.68638668405062e-05, 'epoch': 1.87}\n",
- "{'loss': 1.8237, 'grad_norm': 4.284477710723877, 'learning_rate': 8.651095308043232e-05, 'epoch': 1.89}\n",
- "{'loss': 2.0051, 'grad_norm': 4.610734462738037, 'learning_rate': 8.61540987831238e-05, 'epoch': 1.91}\n",
- "{'loss': 1.9493, 'grad_norm': 4.1395392417907715, 'learning_rate': 8.579334246298593e-05, 'epoch': 1.93}\n",
- "{'loss': 1.8477, 'grad_norm': 3.6301958560943604, 'learning_rate': 8.542872305555978e-05, 'epoch': 1.95}\n",
- "{'loss': 1.7, 'grad_norm': 3.8048858642578125, 'learning_rate': 8.50602799133199e-05, 'epoch': 1.96}\n",
- "{'loss': 1.8371, 'grad_norm': 3.2337429523468018, 'learning_rate': 8.468805280142709e-05, 'epoch': 1.98}\n",
- "{'loss': 1.8531, 'grad_norm': 4.216500282287598, 'learning_rate': 8.43120818934367e-05, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโโ | 1120/3360 [25:00<49:13, 1.32s/it][INFO|trainer.py:3788] 2024-07-04 11:35:10,200 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 11:35:10,200 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 11:35:10,200 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:01, 29.06it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:00<00:01, 26.71it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:00<00:01, 25.85it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:00<00:01, 24.50it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:00<00:01, 24.61it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:00<00:01, 25.00it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:00<00:00, 24.89it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:00<00:00, 24.70it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:00, 24.56it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 24.07it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 24.64it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:01<00:00, 24.64it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:01<00:00, 23.45it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:01<00:00, 24.22it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.01658296585083, 'eval_runtime': 1.9249, 'eval_samples_per_second': 23.898, 'eval_steps_per_second': 23.898, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโโ | 1120/3360 [25:02<49:13, 1.32s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 23.38it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 11:35:12,127 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-1120\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 11:35:13,176 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 11:35:13,177 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 11:35:13,210 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-1120/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 11:35:13,211 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-1120/special_tokens_map.json\n",
- "{'loss': 1.5674, 'grad_norm': 4.559268474578857, 'learning_rate': 8.393240776696274e-05, 'epoch': 2.02}\n",
- "{'loss': 1.4393, 'grad_norm': 3.3662822246551514, 'learning_rate': 8.354907139929851e-05, 'epoch': 2.03}\n",
- "{'loss': 1.5166, 'grad_norm': 4.587384223937988, 'learning_rate': 8.316211416299397e-05, 'epoch': 2.05}\n",
- "{'loss': 1.4818, 'grad_norm': 5.713983535766602, 'learning_rate': 8.27715778213905e-05, 'epoch': 2.07}\n",
- "{'loss': 1.3679, 'grad_norm': 3.7478792667388916, 'learning_rate': 8.237750452411353e-05, 'epoch': 2.09}\n",
- "{'loss': 1.4682, 'grad_norm': 3.7805116176605225, 'learning_rate': 8.197993680252334e-05, 'epoch': 2.11}\n",
- "{'loss': 1.6848, 'grad_norm': 4.318390846252441, 'learning_rate': 8.157891756512488e-05, 'epoch': 2.12}\n",
- "{'loss': 1.447, 'grad_norm': 4.625955581665039, 'learning_rate': 8.117449009293668e-05, 'epoch': 2.14}\n",
- "{'loss': 1.4888, 'grad_norm': 4.70202112197876, 'learning_rate': 8.076669803481965e-05, 'epoch': 2.16}\n",
- "{'loss': 1.5405, 'grad_norm': 6.126914978027344, 'learning_rate': 8.035558540276618e-05, 'epoch': 2.18}\n",
- "{'loss': 1.4751, 'grad_norm': 3.867528200149536, 'learning_rate': 7.994119656715002e-05, 'epoch': 2.2}\n",
- "{'loss': 1.5175, 'grad_norm': 4.935867786407471, 'learning_rate': 7.952357625193749e-05, 'epoch': 2.21}\n",
- "{'loss': 1.5586, 'grad_norm': 5.28302001953125, 'learning_rate': 7.91027695298606e-05, 'epoch': 2.23}\n",
- "{'loss': 1.5798, 'grad_norm': 4.9564738273620605, 'learning_rate': 7.86788218175523e-05, 'epoch': 2.25}\n",
- "{'loss': 1.4184, 'grad_norm': 4.7498779296875, 'learning_rate': 7.8251778870645e-05, 'epoch': 2.27}\n",
- "{'loss': 1.4736, 'grad_norm': 5.780045032501221, 'learning_rate': 7.782168677883206e-05, 'epoch': 2.28}\n",
- "{'loss': 1.5192, 'grad_norm': 3.647230625152588, 'learning_rate': 7.738859196089358e-05, 'epoch': 2.3}\n",
- "{'loss': 1.5836, 'grad_norm': 4.818410396575928, 'learning_rate': 7.695254115968648e-05, 'epoch': 2.32}\n",
- "{'loss': 1.6111, 'grad_norm': 4.5074286460876465, 'learning_rate': 7.651358143709972e-05, 'epoch': 2.34}\n",
- "{'loss': 1.6122, 'grad_norm': 4.6216816902160645, 'learning_rate': 7.60717601689749e-05, 'epoch': 2.36}\n",
- "{'loss': 1.5633, 'grad_norm': 9.873260498046875, 'learning_rate': 7.562712503999327e-05, 'epoch': 2.37}\n",
- "{'loss': 1.7444, 'grad_norm': 4.795359134674072, 'learning_rate': 7.517972403852905e-05, 'epoch': 2.39}\n",
- "{'loss': 1.5804, 'grad_norm': 4.818080902099609, 'learning_rate': 7.472960545147038e-05, 'epoch': 2.41}\n",
- "{'loss': 1.4748, 'grad_norm': 5.576250076293945, 'learning_rate': 7.427681785900761e-05, 'epoch': 2.43}\n",
- "{'loss': 1.5531, 'grad_norm': 4.261260509490967, 'learning_rate': 7.382141012939034e-05, 'epoch': 2.45}\n",
- "{'loss': 1.4554, 'grad_norm': 4.23293399810791, 'learning_rate': 7.33634314136531e-05, 'epoch': 2.46}\n",
- "{'loss': 1.5272, 'grad_norm': 4.627878665924072, 'learning_rate': 7.290293114031061e-05, 'epoch': 2.48}\n",
- "{'loss': 1.6616, 'grad_norm': 4.36018705368042, 'learning_rate': 7.243995901002312e-05, 'epoch': 2.5}\n",
- "{'loss': 1.5503, 'grad_norm': 5.698966026306152, 'learning_rate': 7.197456499023225e-05, 'epoch': 2.52}\n",
- "{'loss': 1.5043, 'grad_norm': 4.486359119415283, 'learning_rate': 7.150679930976825e-05, 'epoch': 2.53}\n",
- "{'loss': 1.5796, 'grad_norm': 8.031678199768066, 'learning_rate': 7.103671245342887e-05, 'epoch': 2.55}\n",
- "{'loss': 1.4317, 'grad_norm': 5.806405544281006, 'learning_rate': 7.056435515653059e-05, 'epoch': 2.57}\n",
- "{'loss': 1.696, 'grad_norm': 6.584068298339844, 'learning_rate': 7.008977839943299e-05, 'epoch': 2.59}\n",
- "{'loss': 1.4768, 'grad_norm': 4.871330261230469, 'learning_rate': 6.961303340203653e-05, 'epoch': 2.61}\n",
- "{'loss': 1.587, 'grad_norm': 3.9512643814086914, 'learning_rate': 6.91341716182545e-05, 'epoch': 2.62}\n",
- "{'loss': 1.4991, 'grad_norm': 3.4907033443450928, 'learning_rate': 6.86532447304597e-05, 'epoch': 2.64}\n",
- "{'loss': 1.4822, 'grad_norm': 4.603860855102539, 'learning_rate': 6.817030464390656e-05, 'epoch': 2.66}\n",
- "{'loss': 1.6408, 'grad_norm': 5.737949371337891, 'learning_rate': 6.768540348112907e-05, 'epoch': 2.68}\n",
- "{'loss': 1.4316, 'grad_norm': 5.838085174560547, 'learning_rate': 6.719859357631535e-05, 'epoch': 2.7}\n",
- "{'loss': 1.414, 'grad_norm': 5.460419654846191, 'learning_rate': 6.670992746965938e-05, 'epoch': 2.71}\n",
- "{'loss': 1.6858, 'grad_norm': 5.311679363250732, 'learning_rate': 6.621945790169036e-05, 'epoch': 2.73}\n",
- "{'loss': 1.5802, 'grad_norm': 4.987999439239502, 'learning_rate': 6.572723780758069e-05, 'epoch': 2.75}\n",
- "{'loss': 1.5672, 'grad_norm': 5.01920223236084, 'learning_rate': 6.523332031143272e-05, 'epoch': 2.77}\n",
- "{'loss': 1.5914, 'grad_norm': 4.382671356201172, 'learning_rate': 6.473775872054521e-05, 'epoch': 2.78}\n",
- "{'loss': 1.4284, 'grad_norm': 3.818115711212158, 'learning_rate': 6.424060651966007e-05, 'epoch': 2.8}\n",
- "{'loss': 1.499, 'grad_norm': 4.427730560302734, 'learning_rate': 6.374191736518974e-05, 'epoch': 2.82}\n",
- "{'loss': 1.4914, 'grad_norm': 4.508190631866455, 'learning_rate': 6.324174507942637e-05, 'epoch': 2.84}\n",
- "{'loss': 1.4629, 'grad_norm': 6.055968284606934, 'learning_rate': 6.274014364473274e-05, 'epoch': 2.86}\n",
- "{'loss': 1.717, 'grad_norm': 4.5250678062438965, 'learning_rate': 6.22371671977162e-05, 'epoch': 2.87}\n",
- "{'loss': 1.5103, 'grad_norm': 4.378949165344238, 'learning_rate': 6.173287002338577e-05, 'epoch': 2.89}\n",
- "{'loss': 1.511, 'grad_norm': 5.3176751136779785, 'learning_rate': 6.122730654929334e-05, 'epoch': 2.91}\n",
- "{'loss': 1.4656, 'grad_norm': 4.5037994384765625, 'learning_rate': 6.072053133965938e-05, 'epoch': 2.93}\n",
- "{'loss': 1.6443, 'grad_norm': 4.189935684204102, 'learning_rate': 6.021259908948402e-05, 'epoch': 2.95}\n",
- "{'loss': 1.6633, 'grad_norm': 4.525129795074463, 'learning_rate': 5.970356461864391e-05, 'epoch': 2.96}\n",
- "{'loss': 1.4935, 'grad_norm': 5.440227508544922, 'learning_rate': 5.919348286597569e-05, 'epoch': 2.98}\n",
- "{'loss': 1.6304, 'grad_norm': 4.765013694763184, 'learning_rate': 5.868240888334653e-05, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโ | 1680/3360 [37:13<35:24, 1.26s/it][INFO|trainer.py:3788] 2024-07-04 11:47:23,337 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 11:47:23,337 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 11:47:23,337 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:01, 36.22it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:01, 30.18it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 28.29it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโ๏ฟฝ๏ฟฝ๏ฟฝโโโ | 15/46 [00:00<00:01, 27.11it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 25.76it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:00<00:00, 26.14it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:00<00:00, 25.68it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:00, 25.54it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 25.92it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 25.97it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 26.05it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:01<00:00, 26.26it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:01<00:00, 26.26it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.0813467502593994, 'eval_runtime': 1.8021, 'eval_samples_per_second': 25.525, 'eval_steps_per_second': 25.525, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโ | 1680/3360 [37:15<35:24, 1.26s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 24.30it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 11:47:25,141 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-1680\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 11:47:25,920 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 11:47:25,920 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 11:47:25,956 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-1680/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 11:47:25,956 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-1680/special_tokens_map.json\n",
- "{'loss': 1.4346, 'grad_norm': 4.2551727294921875, 'learning_rate': 5.8170397829712485e-05, 'epoch': 3.02}\n",
- "{'loss': 1.1148, 'grad_norm': 4.442202568054199, 'learning_rate': 5.765750496516547e-05, 'epoch': 3.03}\n",
- "{'loss': 1.2852, 'grad_norm': 5.140079021453857, 'learning_rate': 5.714378564496901e-05, 'epoch': 3.05}\n",
- "{'loss': 1.2086, 'grad_norm': 5.270480632781982, 'learning_rate': 5.6629295313583974e-05, 'epoch': 3.07}\n",
- "{'loss': 1.1824, 'grad_norm': 5.192230224609375, 'learning_rate': 5.611408949868457e-05, 'epoch': 3.09}\n",
- "{'loss': 1.2875, 'grad_norm': 5.830446720123291, 'learning_rate': 5.559822380516539e-05, 'epoch': 3.11}\n",
- "{'loss': 1.2162, 'grad_norm': 4.606627464294434, 'learning_rate': 5.5081753909140096e-05, 'epoch': 3.12}\n",
- "{'loss': 1.3341, 'grad_norm': 5.547798156738281, 'learning_rate': 5.456473555193242e-05, 'epoch': 3.14}\n",
- "{'loss': 1.2143, 'grad_norm': 5.579686641693115, 'learning_rate': 5.404722453406017e-05, 'epoch': 3.16}\n",
- "{'loss': 1.2823, 'grad_norm': 6.129615783691406, 'learning_rate': 5.3529276709212816e-05, 'epoch': 3.18}\n",
- "{'loss': 1.2582, 'grad_norm': 5.295398712158203, 'learning_rate': 5.30109479782233e-05, 'epoch': 3.2}\n",
- "{'loss': 1.2438, 'grad_norm': 6.145551681518555, 'learning_rate': 5.249229428303486e-05, 'epoch': 3.21}\n",
- "{'loss': 1.4372, 'grad_norm': 4.52131986618042, 'learning_rate': 5.197337160066331e-05, 'epoch': 3.23}\n",
- "{'loss': 1.2346, 'grad_norm': 4.7772955894470215, 'learning_rate': 5.145423593715557e-05, 'epoch': 3.25}\n",
- "{'loss': 1.1929, 'grad_norm': 5.328940391540527, 'learning_rate': 5.0934943321545115e-05, 'epoch': 3.27}\n",
- "{'loss': 1.1731, 'grad_norm': 5.733246803283691, 'learning_rate': 5.041554979980486e-05, 'epoch': 3.28}\n",
- "{'loss': 1.3325, 'grad_norm': 6.418582439422607, 'learning_rate': 4.9896111428798254e-05, 'epoch': 3.3}\n",
- "{'loss': 1.3305, 'grad_norm': 4.787232398986816, 'learning_rate': 4.9376684270229254e-05, 'epoch': 3.32}\n",
- "{'loss': 1.2982, 'grad_norm': 4.655210971832275, 'learning_rate': 4.8857324384591653e-05, 'epoch': 3.34}\n",
- "{'loss': 1.2833, 'grad_norm': 4.85659122467041, 'learning_rate': 4.8338087825118675e-05, 'epoch': 3.36}\n",
- "{'loss': 1.2835, 'grad_norm': 5.313413143157959, 'learning_rate': 4.781903063173321e-05, 'epoch': 3.37}\n",
- "{'loss': 1.2001, 'grad_norm': 4.640489101409912, 'learning_rate': 4.730020882499964e-05, 'epoch': 3.39}\n",
- "{'loss': 1.2597, 'grad_norm': 6.197988033294678, 'learning_rate': 4.678167840007767e-05, 'epoch': 3.41}\n",
- "{'loss': 1.3514, 'grad_norm': 4.942805290222168, 'learning_rate': 4.626349532067879e-05, 'epoch': 3.43}\n",
- "{'loss': 1.3118, 'grad_norm': 5.112833499908447, 'learning_rate': 4.574571551302647e-05, 'epoch': 3.44}\n",
- "{'loss': 1.3232, 'grad_norm': 4.470940113067627, 'learning_rate': 4.522839485981994e-05, 'epoch': 3.46}\n",
- "{'loss': 1.2533, 'grad_norm': 5.801645755767822, 'learning_rate': 4.471158919420312e-05, 'epoch': 3.48}\n",
- "{'loss': 1.2343, 'grad_norm': 6.3296709060668945, 'learning_rate': 4.4195354293738484e-05, 'epoch': 3.5}\n",
- "{'loss': 1.1995, 'grad_norm': 6.262467384338379, 'learning_rate': 4.367974587438733e-05, 'epoch': 3.52}\n",
- "{'loss': 1.2744, 'grad_norm': 5.313882827758789, 'learning_rate': 4.316481958449634e-05, 'epoch': 3.53}\n",
- "{'loss': 1.2366, 'grad_norm': 7.450092792510986, 'learning_rate': 4.2650630998791615e-05, 'epoch': 3.55}\n",
- "{'loss': 1.3738, 'grad_norm': 4.7678680419921875, 'learning_rate': 4.213723561238074e-05, 'epoch': 3.57}\n",
- "{'loss': 1.1538, 'grad_norm': 4.40903377532959, 'learning_rate': 4.162468883476319e-05, 'epoch': 3.59}\n",
- "{'loss': 1.2502, 'grad_norm': 5.227618217468262, 'learning_rate': 4.111304598385018e-05, 'epoch': 3.61}\n",
- "{'loss': 1.3061, 'grad_norm': 6.307828903198242, 'learning_rate': 4.060236227999441e-05, 'epoch': 3.62}\n",
- "{'loss': 1.2667, 'grad_norm': 5.422544002532959, 'learning_rate': 4.0092692840030134e-05, 'epoch': 3.64}\n",
- "{'loss': 1.2039, 'grad_norm': 7.9964141845703125, 'learning_rate': 3.9584092671324606e-05, 'epoch': 3.66}\n",
- "{'loss': 1.3509, 'grad_norm': 7.364163875579834, 'learning_rate': 3.907661666584131e-05, 'epoch': 3.68}\n",
- "{'loss': 1.3427, 'grad_norm': 6.175056457519531, 'learning_rate': 3.857031959421553e-05, 'epoch': 3.69}\n",
- "{'loss': 1.345, 'grad_norm': 5.0636725425720215, 'learning_rate': 3.806525609984312e-05, 'epoch': 3.71}\n",
- "{'loss': 1.1779, 'grad_norm': 5.742904186248779, 'learning_rate': 3.7561480692983006e-05, 'epoch': 3.73}\n",
- "{'loss': 1.29, 'grad_norm': 5.6552276611328125, 'learning_rate': 3.705904774487396e-05, 'epoch': 3.75}\n",
- "{'loss': 1.2704, 'grad_norm': 5.890940189361572, 'learning_rate': 3.655801148186655e-05, 'epoch': 3.77}\n",
- "{'loss': 1.1811, 'grad_norm': 5.2217583656311035, 'learning_rate': 3.6058425979570485e-05, 'epoch': 3.78}\n",
- "{'loss': 1.2768, 'grad_norm': 5.42200231552124, 'learning_rate': 3.556034515701852e-05, 'epoch': 3.8}\n",
- "{'loss': 1.2891, 'grad_norm': 5.615239143371582, 'learning_rate': 3.506382277084696e-05, 'epoch': 3.82}\n",
- "{'loss': 1.2401, 'grad_norm': 5.646175861358643, 'learning_rate': 3.4568912409493945e-05, 'epoch': 3.84}\n",
- "{'loss': 1.0597, 'grad_norm': 5.7333197593688965, 'learning_rate': 3.4075667487415785e-05, 'epoch': 3.86}\n",
- "{'loss': 1.1621, 'grad_norm': 5.321319580078125, 'learning_rate': 3.358414123932195e-05, 'epoch': 3.87}\n",
- "{'loss': 1.2736, 'grad_norm': 4.852396011352539, 'learning_rate': 3.3094386714429724e-05, 'epoch': 3.89}\n",
- "{'loss': 1.2597, 'grad_norm': 7.163392066955566, 'learning_rate': 3.2606456770738636e-05, 'epoch': 3.91}\n",
- "{'loss': 1.1871, 'grad_norm': 5.611868381500244, 'learning_rate': 3.212040406932569e-05, 'epoch': 3.93}\n",
- "{'loss': 1.0307, 'grad_norm': 4.783786296844482, 'learning_rate': 3.163628106866172e-05, 'epoch': 3.94}\n",
- "{'loss': 1.4526, 'grad_norm': 5.691711902618408, 'learning_rate': 3.115414001894974e-05, 'epoch': 3.96}\n",
- "{'loss': 1.3101, 'grad_norm': 5.280589580535889, 'learning_rate': 3.067403295648566e-05, 'epoch': 3.98}\n",
- "{'loss': 1.0932, 'grad_norm': 4.22761869430542, 'learning_rate': 3.019601169804216e-05, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [49:01<23:24, 1.25s/it][INFO|trainer.py:3788] 2024-07-04 11:59:10,533 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 11:59:10,533 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 11:59:10,533 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:01, 37.50it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:01, 29.67it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 27.86it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 27.79it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 27.74it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:00<00:00, 27.55it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:00<00:00, 26.89it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:00<00:00, 26.15it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 26.12it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 25.32it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 26.12it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:01<00:00, 26.56it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:01<00:00, 26.71it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.2238643169403076, 'eval_runtime': 1.7413, 'eval_samples_per_second': 26.417, 'eval_steps_per_second': 26.417, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [49:02<23:24, 1.25s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 26.68it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 11:59:12,277 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-2240\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 11:59:13,447 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 11:59:13,448 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 11:59:13,487 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-2240/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 11:59:13,487 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-2240/special_tokens_map.json\n",
- "{'loss': 1.1326, 'grad_norm': 3.8860654830932617, 'learning_rate': 2.9720127835276256e-05, 'epoch': 4.02}\n",
- "{'loss': 1.0202, 'grad_norm': 5.21559476852417, 'learning_rate': 2.9246432729161055e-05, 'epoch': 4.03}\n",
- "{'loss': 1.0934, 'grad_norm': 5.658751964569092, 'learning_rate': 2.8774977504442647e-05, 'epoch': 4.05}\n",
- "{'loss': 0.9934, 'grad_norm': 5.090124130249023, 'learning_rate': 2.8305813044122097e-05, 'epoch': 4.07}\n",
- "{'loss': 1.0309, 'grad_norm': 5.851395606994629, 'learning_rate': 2.7838989983964065e-05, 'epoch': 4.09}\n",
- "{'loss': 1.187, 'grad_norm': 4.703646659851074, 'learning_rate': 2.737455870703155e-05, 'epoch': 4.11}\n",
- "{'loss': 0.9171, 'grad_norm': 4.95070219039917, 'learning_rate': 2.6912569338248315e-05, 'epoch': 4.12}\n",
- "{'loss': 1.0232, 'grad_norm': 4.933461666107178, 'learning_rate': 2.645307173898901e-05, 'epoch': 4.14}\n",
- "{'loss': 1.1005, 'grad_norm': 5.395535469055176, 'learning_rate': 2.5996115501697694e-05, 'epoch': 4.16}\n",
- "{'loss': 0.9827, 'grad_norm': 4.670980453491211, 'learning_rate': 2.5541749944535554e-05, 'epoch': 4.18}\n",
- "{'loss': 0.9969, 'grad_norm': 9.713501930236816, 'learning_rate': 2.5090024106057962e-05, 'epoch': 4.19}\n",
- "{'loss': 1.0557, 'grad_norm': 5.423773288726807, 'learning_rate': 2.464098673992205e-05, 'epoch': 4.21}\n",
- "{'loss': 0.9903, 'grad_norm': 5.628043174743652, 'learning_rate': 2.4194686309624663e-05, 'epoch': 4.23}\n",
- "{'loss': 1.1397, 'grad_norm': 5.057712554931641, 'learning_rate': 2.3751170983272e-05, 'epoch': 4.25}\n",
- "{'loss': 0.962, 'grad_norm': 4.844544410705566, 'learning_rate': 2.3310488628380757e-05, 'epoch': 4.27}\n",
- "{'loss': 1.0187, 'grad_norm': 7.445083141326904, 'learning_rate': 2.2872686806712035e-05, 'epoch': 4.28}\n",
- "{'loss': 1.0618, 'grad_norm': 4.87847900390625, 'learning_rate': 2.243781276913811e-05, 'epoch': 4.3}\n",
- "{'loss': 0.9125, 'grad_norm': 5.181140899658203, 'learning_rate': 2.200591345054267e-05, 'epoch': 4.32}\n",
- "{'loss': 1.125, 'grad_norm': 8.97202205657959, 'learning_rate': 2.157703546475539e-05, 'epoch': 4.34}\n",
- "{'loss': 0.9747, 'grad_norm': 6.134432792663574, 'learning_rate': 2.115122509952085e-05, 'epoch': 4.36}\n",
- "{'loss': 0.9803, 'grad_norm': 9.630309104919434, 'learning_rate': 2.0728528311502976e-05, 'epoch': 4.37}\n",
- "{'loss': 1.0843, 'grad_norm': 5.363273620605469, 'learning_rate': 2.0308990721324927e-05, 'epoch': 4.39}\n",
- "{'loss': 1.0764, 'grad_norm': 7.712973117828369, 'learning_rate': 1.989265760864542e-05, 'epoch': 4.41}\n",
- "{'loss': 1.1397, 'grad_norm': 5.690403938293457, 'learning_rate': 1.947957390727185e-05, 'epoch': 4.43}\n",
- "{'loss': 1.1258, 'grad_norm': 5.744186878204346, 'learning_rate': 1.906978420031059e-05, 'epoch': 4.44}\n",
- "{'loss': 0.9438, 'grad_norm': 8.820874214172363, 'learning_rate': 1.8663332715355396e-05, 'epoch': 4.46}\n",
- "{'loss': 1.132, 'grad_norm': 4.420164108276367, 'learning_rate': 1.8260263319713844e-05, 'epoch': 4.48}\n",
- "{'loss': 1.0819, 'grad_norm': 5.586333751678467, 'learning_rate': 1.7860619515673033e-05, 'epoch': 4.5}\n",
- "{'loss': 1.0571, 'grad_norm': 5.625140190124512, 'learning_rate': 1.746444443580433e-05, 'epoch': 4.52}\n",
- "{'loss': 1.1021, 'grad_norm': 5.7560577392578125, 'learning_rate': 1.7071780838308288e-05, 'epoch': 4.53}\n",
- "{'loss': 1.0531, 'grad_norm': 5.3450727462768555, 'learning_rate': 1.6682671102399805e-05, 'epoch': 4.55}\n",
- "{'loss': 1.017, 'grad_norm': 6.27817440032959, 'learning_rate': 1.629715722373423e-05, 'epoch': 4.57}\n",
- "{'loss': 1.0471, 'grad_norm': 5.72844934463501, 'learning_rate': 1.5915280809874932e-05, 'epoch': 4.59}\n",
- "{'loss': 0.9309, 'grad_norm': 5.988643169403076, 'learning_rate': 1.553708307580265e-05, 'epoch': 4.61}\n",
- "{'loss': 1.0538, 'grad_norm': 5.950584411621094, 'learning_rate': 1.5162604839467265e-05, 'epoch': 4.62}\n",
- "{'loss': 1.0554, 'grad_norm': 4.944731712341309, 'learning_rate': 1.4791886517382413e-05, 'epoch': 4.64}\n",
- "{'loss': 1.0857, 'grad_norm': 6.031637191772461, 'learning_rate': 1.4424968120263504e-05, 'epoch': 4.66}\n",
- "{'loss': 1.0667, 'grad_norm': 5.933581352233887, 'learning_rate': 1.4061889248709343e-05, 'epoch': 4.68}\n",
- "{'loss': 0.9942, 'grad_norm': 6.697149276733398, 'learning_rate': 1.370268908892825e-05, 'epoch': 4.69}\n",
- "{'loss': 1.0146, 'grad_norm': 7.122743129730225, 'learning_rate': 1.3347406408508695e-05, 'epoch': 4.71}\n",
- "{'loss': 0.9921, 'grad_norm': 4.69237756729126, 'learning_rate': 1.2996079552235263e-05, 'epoch': 4.73}\n",
- "{'loss': 1.0017, 'grad_norm': 5.421998977661133, 'learning_rate': 1.264874643795021e-05, 'epoch': 4.75}\n",
- "{'loss': 1.0102, 'grad_norm': 6.121133804321289, 'learning_rate': 1.230544455246101e-05, 'epoch': 4.77}\n",
- "{'loss': 1.062, 'grad_norm': 5.060891151428223, 'learning_rate': 1.1966210947494583e-05, 'epoch': 4.78}\n",
- "{'loss': 1.125, 'grad_norm': 4.1661529541015625, 'learning_rate': 1.1631082235698316e-05, 'epoch': 4.8}\n",
- "{'loss': 0.9848, 'grad_norm': 6.484502792358398, 'learning_rate': 1.130009458668863e-05, 'epoch': 4.82}\n",
- "{'loss': 0.9632, 'grad_norm': 5.1096086502075195, 'learning_rate': 1.097328372314721e-05, 'epoch': 4.84}\n",
- "{'loss': 1.0435, 'grad_norm': 7.69472074508667, 'learning_rate': 1.0650684916965559e-05, 'epoch': 4.85}\n",
- "{'loss': 1.1122, 'grad_norm': 6.654355525970459, 'learning_rate': 1.0332332985438248e-05, 'epoch': 4.87}\n",
- "{'loss': 1.0846, 'grad_norm': 6.456166744232178, 'learning_rate': 1.0018262287505086e-05, 'epoch': 4.89}\n",
- "{'loss': 1.1491, 'grad_norm': 5.556300163269043, 'learning_rate': 9.708506720042932e-06, 'epoch': 4.91}\n",
- "{'loss': 1.0227, 'grad_norm': 5.921450614929199, 'learning_rate': 9.403099714207175e-06, 'epoch': 4.93}\n",
- "{'loss': 1.13, 'grad_norm': 5.2472052574157715, 'learning_rate': 9.102074231823727e-06, 'epoch': 4.94}\n",
- "{'loss': 1.1184, 'grad_norm': 6.798206806182861, 'learning_rate': 8.805462761831418e-06, 'epoch': 4.96}\n",
- "{'loss': 1.1483, 'grad_norm': 6.1544647216796875, 'learning_rate': 8.513297316775625e-06, 'epoch': 4.98}\n",
- "{'loss': 1.0966, 'grad_norm': 5.619192600250244, 'learning_rate': 8.225609429353187e-06, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [1:00:45<11:43, 1.26s/it][INFO|trainer.py:3788] 2024-07-04 12:10:55,158 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 12:10:55,158 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 12:10:55,158 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:01, 31.69it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:01, 26.12it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:00<00:01, 23.11it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 23.84it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 24.23it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:01, 24.91it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:00<00:00, 25.15it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:00, 25.17it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 23.97it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 24.14it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 24.94it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 25.29it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:01<00:00, 25.29it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.439286708831787, 'eval_runtime': 1.9084, 'eval_samples_per_second': 24.104, 'eval_steps_per_second': 24.104, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [1:00:47<11:43, 1.26s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 25.52it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 12:10:57,069 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-2800\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 12:10:57,881 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 12:10:57,882 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 12:10:57,908 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-2800/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 12:10:57,908 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-2800/special_tokens_map.json\n",
- "{'loss': 0.9401, 'grad_norm': 5.465145587921143, 'learning_rate': 7.942430149009161e-06, 'epoch': 5.02}\n",
- "{'loss': 0.9053, 'grad_norm': 5.05084228515625, 'learning_rate': 7.663790038585793e-06, 'epoch': 5.03}\n",
- "{'loss': 0.9659, 'grad_norm': 6.576834678649902, 'learning_rate': 7.389719171023857e-06, 'epoch': 5.05}\n",
- "{'loss': 0.9701, 'grad_norm': 6.316474437713623, 'learning_rate': 7.1202471261170245e-06, 'epoch': 5.07}\n",
- "{'loss': 0.9986, 'grad_norm': 5.003658771514893, 'learning_rate': 6.855402987319348e-06, 'epoch': 5.09}\n",
- "{'loss': 1.0909, 'grad_norm': 5.66398286819458, 'learning_rate': 6.595215338606397e-06, 'epoch': 5.1}\n",
- "{'loss': 0.8373, 'grad_norm': 5.239981174468994, 'learning_rate': 6.339712261390213e-06, 'epoch': 5.12}\n",
- "{'loss': 1.0119, 'grad_norm': 5.830501079559326, 'learning_rate': 6.088921331488568e-06, 'epoch': 5.14}\n",
- "{'loss': 0.89, 'grad_norm': 5.683416366577148, 'learning_rate': 5.8428696161488215e-06, 'epoch': 5.16}\n",
- "{'loss': 0.8321, 'grad_norm': 5.024005889892578, 'learning_rate': 5.601583671126531e-06, 'epoch': 5.18}\n",
- "{'loss': 0.9924, 'grad_norm': 5.65994930267334, 'learning_rate': 5.365089537819434e-06, 'epoch': 5.19}\n",
- "{'loss': 0.982, 'grad_norm': 5.285236835479736, 'learning_rate': 5.133412740456806e-06, 'epoch': 5.21}\n",
- "{'loss': 0.988, 'grad_norm': 6.087540149688721, 'learning_rate': 4.906578283344759e-06, 'epoch': 5.23}\n",
- "{'loss': 1.0628, 'grad_norm': 5.564962863922119, 'learning_rate': 4.684610648167503e-06, 'epoch': 5.25}\n",
- "{'loss': 0.9339, 'grad_norm': 5.311854362487793, 'learning_rate': 4.467533791345191e-06, 'epoch': 5.27}\n",
- "{'loss': 0.9112, 'grad_norm': 6.383027076721191, 'learning_rate': 4.255371141448272e-06, 'epoch': 5.28}\n",
- "{'loss': 0.8851, 'grad_norm': 5.323634147644043, 'learning_rate': 4.048145596668967e-06, 'epoch': 5.3}\n",
- "{'loss': 0.9129, 'grad_norm': 5.474393844604492, 'learning_rate': 3.84587952234991e-06, 'epoch': 5.32}\n",
- "{'loss': 0.8769, 'grad_norm': 6.140456676483154, 'learning_rate': 3.6485947485702832e-06, 'epoch': 5.34}\n",
- "{'loss': 0.9177, 'grad_norm': 5.710687637329102, 'learning_rate': 3.4563125677897932e-06, 'epoch': 5.35}\n",
- "{'loss': 0.9235, 'grad_norm': 6.829979419708252, 'learning_rate': 3.269053732550581e-06, 'epoch': 5.37}\n",
- "{'loss': 0.9744, 'grad_norm': 6.83032751083374, 'learning_rate': 3.086838453237506e-06, 'epoch': 5.39}\n",
- "{'loss': 0.7769, 'grad_norm': 5.491135597229004, 'learning_rate': 2.9096863958968268e-06, 'epoch': 5.41}\n",
- "{'loss': 0.8412, 'grad_norm': 6.708963394165039, 'learning_rate': 2.737616680113758e-06, 'epoch': 5.43}\n",
- "{'loss': 0.8511, 'grad_norm': 6.676459312438965, 'learning_rate': 2.570647876948895e-06, 'epoch': 5.44}\n",
- "{'loss': 0.9549, 'grad_norm': 7.339512825012207, 'learning_rate': 2.408798006933882e-06, 'epoch': 5.46}\n",
- "{'loss': 1.1274, 'grad_norm': 6.678201198577881, 'learning_rate': 2.252084538126542e-06, 'epoch': 5.48}\n",
- "{'loss': 0.9263, 'grad_norm': 6.124770641326904, 'learning_rate': 2.100524384225555e-06, 'epoch': 5.5}\n",
- "{'loss': 0.943, 'grad_norm': 7.016269207000732, 'learning_rate': 1.9541339027450256e-06, 'epoch': 5.52}\n",
- "{'loss': 0.9571, 'grad_norm': 5.896731853485107, 'learning_rate': 1.8129288932490274e-06, 'epoch': 5.53}\n",
- "{'loss': 0.8802, 'grad_norm': 5.532138347625732, 'learning_rate': 1.6769245956464396e-06, 'epoch': 5.55}\n",
- "{'loss': 1.0613, 'grad_norm': 6.437876224517822, 'learning_rate': 1.5461356885461075e-06, 'epoch': 5.57}\n",
- "{'loss': 0.957, 'grad_norm': 5.419349670410156, 'learning_rate': 1.4205762876726092e-06, 'epoch': 5.59}\n",
- "{'loss': 1.0672, 'grad_norm': 6.222854137420654, 'learning_rate': 1.3002599443428243e-06, 'epoch': 5.6}\n",
- "{'loss': 1.0228, 'grad_norm': 6.305788993835449, 'learning_rate': 1.1851996440033319e-06, 'epoch': 5.62}\n",
- "{'loss': 0.8494, 'grad_norm': 6.640852928161621, 'learning_rate': 1.0754078048289374e-06, 'epoch': 5.64}\n",
- "{'loss': 0.9589, 'grad_norm': 5.630051612854004, 'learning_rate': 9.708962763824048e-07, 'epoch': 5.66}\n",
- "{'loss': 0.9514, 'grad_norm': 5.754588603973389, 'learning_rate': 8.716763383355864e-07, 'epoch': 5.68}\n",
- "{'loss': 0.9896, 'grad_norm': 6.073591232299805, 'learning_rate': 7.777586992519959e-07, 'epoch': 5.69}\n",
- "{'loss': 0.8798, 'grad_norm': 6.883085250854492, 'learning_rate': 6.891534954310885e-07, 'epoch': 5.71}\n",
- "{'loss': 0.9749, 'grad_norm': 5.874994277954102, 'learning_rate': 6.058702898142643e-07, 'epoch': 5.73}\n",
- "{'loss': 0.862, 'grad_norm': 5.205725193023682, 'learning_rate': 5.279180709527765e-07, 'epoch': 5.75}\n",
- "{'loss': 1.0397, 'grad_norm': 6.112522602081299, 'learning_rate': 4.553052520375911e-07, 'epoch': 5.77}\n",
- "{'loss': 0.8691, 'grad_norm': 6.450985431671143, 'learning_rate': 3.8803966999139684e-07, 'epoch': 5.78}\n",
- "{'loss': 0.884, 'grad_norm': 5.139239311218262, 'learning_rate': 3.261285846227868e-07, 'epoch': 5.8}\n",
- "{'loss': 0.8508, 'grad_norm': 6.213397979736328, 'learning_rate': 2.6957867784270787e-07, 'epoch': 5.82}\n",
- "{'loss': 0.8554, 'grad_norm': 27.320371627807617, 'learning_rate': 2.1839605294330933e-07, 'epoch': 5.84}\n",
- "{'loss': 1.036, 'grad_norm': 7.248013973236084, 'learning_rate': 1.725862339392259e-07, 'epoch': 5.85}\n",
- "{'loss': 0.8262, 'grad_norm': 7.534704685211182, 'learning_rate': 1.3215416497138754e-07, 'epoch': 5.87}\n",
- "{'loss': 1.0454, 'grad_norm': 5.765580654144287, 'learning_rate': 9.710420977340762e-08, 'epoch': 5.89}\n",
- "{'loss': 0.8484, 'grad_norm': 5.267190456390381, 'learning_rate': 6.744015120061509e-08, 'epoch': 5.91}\n",
- "{'loss': 0.9818, 'grad_norm': 6.66579008102417, 'learning_rate': 4.316519082179227e-08, 'epoch': 5.93}\n",
- "{'loss': 0.8825, 'grad_norm': 4.743204593658447, 'learning_rate': 2.4281948573617874e-08, 'epoch': 5.94}\n",
- "{'loss': 0.9975, 'grad_norm': 6.015940189361572, 'learning_rate': 1.0792462477909882e-08, 'epoch': 5.96}\n",
- "{'loss': 0.9418, 'grad_norm': 5.236660957336426, 'learning_rate': 2.6981884216847884e-09, 'epoch': 5.98}\n",
- "{'loss': 0.9678, 'grad_norm': 5.222324371337891, 'learning_rate': 0.0, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [1:12:30<00:00, 1.25s/it][INFO|trainer.py:3788] 2024-07-04 12:22:39,963 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 12:22:39,963 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 12:22:39,964 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:01, 33.59it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:01, 28.40it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:00<00:01, 27.40it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 27.09it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 26.45it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:00<00:01, 25.97it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:00<00:00, 24.53it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:00<00:00, 25.01it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 24.12it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 24.19it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 24.74it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 25.42it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ๏ฟฝ๏ฟฝ๏ฟฝโโโโ | 41/46 [00:01<00:00, 25.84it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.547395706176758, 'eval_runtime': 1.8294, 'eval_samples_per_second': 25.145, 'eval_steps_per_second': 25.145, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [1:12:32<00:00, 1.25s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 26.08it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 12:22:41,795 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-3360\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 12:22:42,459 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 12:22:42,460 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 12:22:42,487 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/checkpoint-3360/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 12:22:42,487 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/checkpoint-3360/special_tokens_map.json\n",
- "[INFO|trainer.py:2383] 2024-07-04 12:22:42,628 >> \n",
- "\n",
- "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
- "\n",
- "\n",
- "{'train_runtime': 4358.4327, 'train_samples_per_second': 6.17, 'train_steps_per_second': 0.771, 'train_loss': 1.4797242326395852, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [1:12:33<00:00, 1.30s/it]\n",
- "[INFO|trainer.py:3478] 2024-07-04 12:22:42,631 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 12:22:43,255 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 12:22:43,256 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 12:22:43,285 >> tokenizer config file saved in saves/qwen2-0.5b/lora/sft/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 12:22:43,285 >> Special tokens file saved in saves/qwen2-0.5b/lora/sft/special_tokens_map.json\n",
- "***** train metrics *****\n",
- " epoch = 5.9973\n",
- " total_flos = 4594110GF\n",
- " train_loss = 1.4797\n",
- " train_runtime = 1:12:38.43\n",
- " train_samples_per_second = 6.17\n",
- " train_steps_per_second = 0.771\n",
- "Figure saved at: saves/qwen2-0.5b/lora/sft/training_loss.png\n",
- "Figure saved at: saves/qwen2-0.5b/lora/sft/training_eval_loss.png\n",
- "[INFO|trainer.py:3788] 2024-07-04 12:22:43,568 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 12:22:43,568 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 12:22:43,568 >> Batch size = 1\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:01<00:00, 25.60it/s]\n",
- "***** eval metrics *****\n",
- " epoch = 5.9973\n",
- " eval_loss = 2.5474\n",
- " eval_runtime = 0:00:01.84\n",
- " eval_samples_per_second = 24.959\n",
- " eval_steps_per_second = 24.959\n",
- "[INFO|modelcard.py:449] 2024-07-04 12:22:45,413 >> Dropping the following result as it does not have all the necessary fields:\n",
- "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: | 0.085 MB of 0.085 MB uploaded\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime โโโโโโโ
\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm โโโโโโโโโโโโโโโโโโโโโ
โโโโ
โ
โโโโ
โโโโโ
โ
โโโโ
\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate โโโ
โโโโโโโโโโโโโโโโโ
โ
โ
โโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss โโโโโโโโโโโ
โ
โโ
โโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss 2.5474\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime 1.843\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second 24.959\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second 24.959\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: total_flos 4932888177414144.0\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch 5.99732\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step 3360\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm 5.22232\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate 0.0\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 0.9678\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_loss 1.47972\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_runtime 4358.4327\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_samples_per_second 6.17\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_steps_per_second 0.771\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run \u001b[33mqwen2_0.5b_lora_sft\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/u8sqhi0x\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 6 W&B file(s), 0 media file(s), 1 artifact file(s) and 0 other file(s)\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20240704_111005-u8sqhi0x/logs\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The new W&B backend becomes opt-out in version 0.18.0; try it out with `wandb.require(\"core\")`! See https://wandb.me/wandb-core for more information.\n",
- "CPU times: user 59.8 s, sys: 18.1 s, total: 1min 17s\n",
- "Wall time: 1h 13min 51s\n"
- ]
- }
- ],
- "source": [
- "%%time\n",
- "\n",
- "!./scripts/tune-lf.sh config/qwen2_0.5b_lora_sft.yaml"
- ]
- },
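- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The cell above fine-tunes Qwen2-0.5B-Instruct by shelling out to LLaMA-Factory with `config/qwen2_0.5b_lora_sft.yaml`: 3,360 optimization steps over 6 epochs (per-device batch size 1, gradient accumulation 8), with an evaluation pass over the 46-example held-out split at the end of each epoch. The next cell reruns the same script for Qwen2-1.5B-Instruct, and its log prints one tokenized training example: every prompt token is labeled `-100`, so only the assistant response contributes to the cross-entropy loss. A minimal sketch of that masking convention, as an illustrative helper rather than LLaMA-Factory's actual code:\n",
- "\n",
- "```python\n",
- "IGNORE_INDEX = -100  # positions with this label are skipped by the loss\n",
- "\n",
- "def build_labels(prompt_ids: list[int], response_ids: list[int]):\n",
- "    # Hypothetical helper: concatenate prompt and response,\n",
- "    # but supervise only the response tokens.\n",
- "    input_ids = prompt_ids + response_ids\n",
- "    labels = [IGNORE_INDEX] * len(prompt_ids) + response_ids\n",
- "    return input_ids, labels\n",
- "```\n",
- "\n",
- "In the dump below, the first 34 entries of `label_ids` are `-100` (the user turn) and the rest mirror the response token ids, ending with the `<|im_end|>` id 151645."
- ]
- },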
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading /home/inflaton/code/projects/courses/llm-finetuning/llm_toolkit/translation_engine.py\n",
- "Current Directory:\n",
- "/home/inflaton/code/projects/courses/llm-finetuning/llama-factory\n",
- "07/04/2024 12:22:59 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 12:23:00,122 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/vocab.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 12:23:00,122 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/merges.txt\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 12:23:00,122 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/tokenizer.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 12:23:00,122 >> loading file added_tokens.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 12:23:00,122 >> loading file special_tokens_map.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 12:23:00,122 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/tokenizer_config.json\n",
- "[WARNING|logging.py:313] 2024-07-04 12:23:00,234 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
- "07/04/2024 12:23:00 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n",
- "07/04/2024 12:23:00 - INFO - llamafactory.data.template - Add <|im_start|> to stop words.\n",
- "07/04/2024 12:23:00 - INFO - llamafactory.data.loader - Loading dataset alpaca_mac.json...\n",
- "Converting format of dataset (num_proc=16): 100%|โ| 4528/4528 [00:00<00:00, 1573\n",
- "Running tokenizer on dataset (num_proc=16): 100%|โ| 4528/4528 [00:01<00:00, 3491\n",
- "input_ids:\n",
- "[151644, 872, 198, 5501, 14683, 279, 2701, 8453, 1467, 1119, 6364, 323, 3410, 1172, 279, 24531, 2213, 11, 4302, 770, 624, 35987, 102895, 99164, 100324, 100717, 100095, 99509, 1773, 151645, 198, 151644, 77091, 198, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "inputs:\n",
- "<|im_start|>user\n",
- "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n",
- "ๅ
จไป็็ไปๆญๆใ<|im_end|>\n",
- "<|im_start|>assistant\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "label_ids:\n",
- "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "labels:\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 12:23:03,981 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 12:23:03,982 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|modeling_utils.py:3556] 2024-07-04 12:23:04,016 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/model.safetensors\n",
- "[INFO|modeling_utils.py:1531] 2024-07-04 12:23:06,701 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n",
- "[INFO|configuration_utils.py:1000] 2024-07-04 12:23:06,704 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645\n",
- "}\n",
- "\n",
- "[INFO|modeling_utils.py:4364] 2024-07-04 12:26:42,040 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n",
- "\n",
- "[INFO|modeling_utils.py:4372] 2024-07-04 12:26:42,040 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at Qwen/Qwen2-1.5B-Instruct.\n",
- "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n",
- "[INFO|configuration_utils.py:955] 2024-07-04 12:26:42,765 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/generation_config.json\n",
- "[INFO|configuration_utils.py:1000] 2024-07-04 12:26:42,766 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"do_sample\": true,\n",
- " \"eos_token_id\": [\n",
- " 151645,\n",
- " 151643\n",
- " ],\n",
- " \"pad_token_id\": 151643,\n",
- " \"repetition_penalty\": 1.1,\n",
- " \"temperature\": 0.7,\n",
- " \"top_k\": 20,\n",
- " \"top_p\": 0.8\n",
- "}\n",
- "\n",
- "07/04/2024 12:26:43 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n",
- "07/04/2024 12:26:43 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.\n",
- "07/04/2024 12:26:43 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n",
- "07/04/2024 12:26:43 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n",
- "07/04/2024 12:26:43 - INFO - llamafactory.model.model_utils.misc - Found linear modules: q_proj,gate_proj,down_proj,k_proj,v_proj,up_proj,o_proj\n",
- "07/04/2024 12:26:43 - INFO - llamafactory.model.loader - trainable params: 9,232,384 || all params: 1,552,946,688 || trainable%: 0.5945\n",
- "[INFO|trainer.py:642] 2024-07-04 12:26:43,511 >> Using auto half precision backend\n",
- "[INFO|trainer.py:2128] 2024-07-04 12:26:43,666 >> ***** Running training *****\n",
- "[INFO|trainer.py:2129] 2024-07-04 12:26:43,666 >> Num examples = 4,482\n",
- "[INFO|trainer.py:2130] 2024-07-04 12:26:43,666 >> Num Epochs = 6\n",
- "[INFO|trainer.py:2131] 2024-07-04 12:26:43,666 >> Instantaneous batch size per device = 1\n",
- "[INFO|trainer.py:2134] 2024-07-04 12:26:43,666 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n",
- "[INFO|trainer.py:2135] 2024-07-04 12:26:43,666 >> Gradient Accumulation steps = 8\n",
- "[INFO|trainer.py:2136] 2024-07-04 12:26:43,666 >> Total optimization steps = 3,360\n",
- "[INFO|trainer.py:2137] 2024-07-04 12:26:43,668 >> Number of trainable parameters = 9,232,384\n",
- "[INFO|integration_utils.py:750] 2024-07-04 12:26:43,670 >> Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33minflaton-sg\u001b[0m (\u001b[33minflaton-ai\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.17.4\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m/home/inflaton/code/projects/courses/llm-finetuning/llama-factory/wandb/run-20240704_122645-mpc5sxtf\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mqwen2_1.5b_lora_sft\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/mpc5sxtf\u001b[0m\n",
- "{'loss': 2.1612, 'grad_norm': 1.7288845777511597, 'learning_rate': 2.9761904761904763e-06, 'epoch': 0.02}\n",
- "{'loss': 2.2871, 'grad_norm': 1.9337925910949707, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.04}\n",
- "{'loss': 2.1455, 'grad_norm': 1.5129448175430298, 'learning_rate': 8.92857142857143e-06, 'epoch': 0.05}\n",
- "{'loss': 2.1376, 'grad_norm': 2.9766852855682373, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.07}\n",
- "{'loss': 2.2937, 'grad_norm': 1.413576602935791, 'learning_rate': 1.4880952380952381e-05, 'epoch': 0.09}\n",
- "{'loss': 2.0076, 'grad_norm': 1.7012724876403809, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.11}\n",
- "{'loss': 2.1399, 'grad_norm': 1.679208517074585, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.12}\n",
- "{'loss': 1.9036, 'grad_norm': 1.6296344995498657, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.14}\n",
- "{'loss': 2.0186, 'grad_norm': 2.1293675899505615, 'learning_rate': 2.6785714285714288e-05, 'epoch': 0.16}\n",
- "{'loss': 1.9517, 'grad_norm': 1.4419277906417847, 'learning_rate': 2.9761904761904762e-05, 'epoch': 0.18}\n",
- "{'loss': 1.979, 'grad_norm': 1.8672434091567993, 'learning_rate': 3.273809523809524e-05, 'epoch': 0.2}\n",
- "{'loss': 1.9362, 'grad_norm': 1.3589439392089844, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.21}\n",
- "{'loss': 1.9264, 'grad_norm': 1.71873140335083, 'learning_rate': 3.8690476190476195e-05, 'epoch': 0.23}\n",
- "{'loss': 1.9515, 'grad_norm': 2.2398152351379395, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.25}\n",
- "{'loss': 1.8163, 'grad_norm': 1.5651923418045044, 'learning_rate': 4.464285714285715e-05, 'epoch': 0.27}\n",
- "{'loss': 1.792, 'grad_norm': 1.5333657264709473, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.29}\n",
- "{'loss': 1.9457, 'grad_norm': 1.448676347732544, 'learning_rate': 5.05952380952381e-05, 'epoch': 0.3}\n",
- "{'loss': 1.7585, 'grad_norm': 1.8496005535125732, 'learning_rate': 5.3571428571428575e-05, 'epoch': 0.32}\n",
- "{'loss': 1.8682, 'grad_norm': 2.332167387008667, 'learning_rate': 5.6547619047619046e-05, 'epoch': 0.34}\n",
- "{'loss': 1.9775, 'grad_norm': 1.9075323343276978, 'learning_rate': 5.9523809523809524e-05, 'epoch': 0.36}\n",
- "{'loss': 1.9233, 'grad_norm': 1.8132203817367554, 'learning_rate': 6.25e-05, 'epoch': 0.37}\n",
- "{'loss': 1.8469, 'grad_norm': 2.514983892440796, 'learning_rate': 6.547619047619048e-05, 'epoch': 0.39}\n",
- "{'loss': 1.8242, 'grad_norm': 2.0344440937042236, 'learning_rate': 6.845238095238096e-05, 'epoch': 0.41}\n",
- "{'loss': 1.9965, 'grad_norm': 2.310185194015503, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.43}\n",
- "{'loss': 1.9004, 'grad_norm': 2.3513343334198, 'learning_rate': 7.440476190476191e-05, 'epoch': 0.45}\n",
- "{'loss': 1.8188, 'grad_norm': 2.2934393882751465, 'learning_rate': 7.738095238095239e-05, 'epoch': 0.46}\n",
- "{'loss': 1.8803, 'grad_norm': 2.8724184036254883, 'learning_rate': 8.035714285714287e-05, 'epoch': 0.48}\n",
- "{'loss': 1.9181, 'grad_norm': 2.4238462448120117, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.5}\n",
- "{'loss': 1.6932, 'grad_norm': 1.5286414623260498, 'learning_rate': 8.630952380952382e-05, 'epoch': 0.52}\n",
- "{'loss': 1.8331, 'grad_norm': 2.563647985458374, 'learning_rate': 8.92857142857143e-05, 'epoch': 0.54}\n",
- "{'loss': 1.8539, 'grad_norm': 2.127699613571167, 'learning_rate': 9.226190476190478e-05, 'epoch': 0.55}\n",
- "{'loss': 1.6796, 'grad_norm': 3.4179396629333496, 'learning_rate': 9.523809523809524e-05, 'epoch': 0.57}\n",
- "{'loss': 1.7209, 'grad_norm': 2.492151975631714, 'learning_rate': 9.821428571428572e-05, 'epoch': 0.59}\n",
- "{'loss': 1.7723, 'grad_norm': 2.3568859100341797, 'learning_rate': 9.999956828659095e-05, 'epoch': 0.61}\n",
- "{'loss': 1.7839, 'grad_norm': 3.5560832023620605, 'learning_rate': 9.999471159635539e-05, 'epoch': 0.62}\n",
- "{'loss': 1.7146, 'grad_norm': 1.712493658065796, 'learning_rate': 9.998445910004082e-05, 'epoch': 0.64}\n",
- "{'loss': 1.8911, 'grad_norm': 2.824240207672119, 'learning_rate': 9.996881190417393e-05, 'epoch': 0.66}\n",
- "{'loss': 1.8631, 'grad_norm': 2.2122113704681396, 'learning_rate': 9.994777169751806e-05, 'epoch': 0.68}\n",
- "{'loss': 1.5738, 'grad_norm': 1.7466025352478027, 'learning_rate': 9.992134075089084e-05, 'epoch': 0.7}\n",
- "{'loss': 1.7552, 'grad_norm': 2.581709623336792, 'learning_rate': 9.988952191691925e-05, 'epoch': 0.71}\n",
- "{'loss': 1.808, 'grad_norm': 2.1387972831726074, 'learning_rate': 9.985231862973168e-05, 'epoch': 0.73}\n",
- "{'loss': 1.8303, 'grad_norm': 1.7675608396530151, 'learning_rate': 9.980973490458728e-05, 'epoch': 0.75}\n",
- "{'loss': 1.8013, 'grad_norm': 2.706218719482422, 'learning_rate': 9.976177533744261e-05, 'epoch': 0.77}\n",
- "{'loss': 1.7443, 'grad_norm': 2.9387295246124268, 'learning_rate': 9.97084451044556e-05, 'epoch': 0.79}\n",
- "{'loss': 1.6509, 'grad_norm': 1.6503076553344727, 'learning_rate': 9.964974996142698e-05, 'epoch': 0.8}\n",
- "{'loss': 1.722, 'grad_norm': 2.0305140018463135, 'learning_rate': 9.958569624317893e-05, 'epoch': 0.82}\n",
- "{'loss': 1.7625, 'grad_norm': 2.8122429847717285, 'learning_rate': 9.951629086287151e-05, 'epoch': 0.84}\n",
- "{'loss': 1.7194, 'grad_norm': 2.0110862255096436, 'learning_rate': 9.944154131125642e-05, 'epoch': 0.86}\n",
- "{'loss': 1.7894, 'grad_norm': 1.7363322973251343, 'learning_rate': 9.936145565586871e-05, 'epoch': 0.87}\n",
- "{'loss': 1.9447, 'grad_norm': 1.8065259456634521, 'learning_rate': 9.927604254015585e-05, 'epoch': 0.89}\n",
- "{'loss': 1.8639, 'grad_norm': 1.8963510990142822, 'learning_rate': 9.918531118254507e-05, 'epoch': 0.91}\n",
- "{'loss': 1.7336, 'grad_norm': 2.30542254447937, 'learning_rate': 9.90892713754483e-05, 'epoch': 0.93}\n",
- "{'loss': 1.7705, 'grad_norm': 2.9846692085266113, 'learning_rate': 9.898793348420536e-05, 'epoch': 0.95}\n",
- "{'loss': 1.7884, 'grad_norm': 2.1550045013427734, 'learning_rate': 9.888130844596524e-05, 'epoch': 0.96}\n",
- "{'loss': 1.7428, 'grad_norm': 2.1323790550231934, 'learning_rate': 9.876940776850569e-05, 'epoch': 0.98}\n",
- "{'loss': 1.7183, 'grad_norm': 1.8198726177215576, 'learning_rate': 9.865224352899119e-05, 'epoch': 1.0}\n",
- " 17%|โโโโโโโ | 560/3360 [15:31<1:20:24, 1.72s/it][INFO|trainer.py:3788] 2024-07-04 12:42:20,584 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 12:42:20,584 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 12:42:20,585 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 27.33it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:01, 20.64it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 20.40it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 20.19it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 19.61it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 19.78it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:01<00:01, 18.45it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:01<00:01, 19.00it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 19.55it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 19.15it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 18.70it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 18.19it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 18.76it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 18.98it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 18.88it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:02<00:00, 19.56it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 1.7211226224899292, 'eval_runtime': 2.4286, 'eval_samples_per_second': 18.941, 'eval_steps_per_second': 18.941, 'epoch': 1.0}\n",
- " 17%|โโโโโโโ | 560/3360 [15:34<1:20:24, 1.72s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 19.47it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 12:42:23,015 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-560\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 12:42:23,808 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 12:42:23,809 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 12:42:23,882 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-560/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 12:42:23,883 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-560/special_tokens_map.json\n",
- "{'loss': 1.6305, 'grad_norm': 1.8726240396499634, 'learning_rate': 9.852982837266955e-05, 'epoch': 1.02}\n",
- "{'loss': 1.4509, 'grad_norm': 2.8097503185272217, 'learning_rate': 9.840217551150706e-05, 'epoch': 1.04}\n",
- "{'loss': 1.6345, 'grad_norm': 2.0100064277648926, 'learning_rate': 9.826929872276255e-05, 'epoch': 1.05}\n",
- "{'loss': 1.5736, 'grad_norm': 2.456465482711792, 'learning_rate': 9.81312123475006e-05, 'epoch': 1.07}\n",
- "{'loss': 1.5363, 'grad_norm': 2.7739548683166504, 'learning_rate': 9.798793128904356e-05, 'epoch': 1.09}\n",
- "{'loss': 1.5754, 'grad_norm': 2.8599655628204346, 'learning_rate': 9.78394710113631e-05, 'epoch': 1.11}\n",
- "{'loss': 1.5728, 'grad_norm': 2.2901456356048584, 'learning_rate': 9.768584753741134e-05, 'epoch': 1.12}\n",
- "{'loss': 1.5632, 'grad_norm': 3.6802914142608643, 'learning_rate': 9.752707744739145e-05, 'epoch': 1.14}\n",
- "{'loss': 1.5927, 'grad_norm': 2.5885791778564453, 'learning_rate': 9.736317787696816e-05, 'epoch': 1.16}\n",
- "{'loss': 1.4571, 'grad_norm': 2.383814573287964, 'learning_rate': 9.719416651541839e-05, 'epoch': 1.18}\n",
- "{'loss': 1.424, 'grad_norm': 2.032453775405884, 'learning_rate': 9.702006160372209e-05, 'epoch': 1.2}\n",
- "{'loss': 1.5577, 'grad_norm': 2.8879408836364746, 'learning_rate': 9.684088193259355e-05, 'epoch': 1.21}\n",
- "{'loss': 1.5083, 'grad_norm': 2.9004592895507812, 'learning_rate': 9.665664684045333e-05, 'epoch': 1.23}\n",
- "{'loss': 1.5696, 'grad_norm': 3.4651644229888916, 'learning_rate': 9.646737621134112e-05, 'epoch': 1.25}\n",
- "{'loss': 1.542, 'grad_norm': 3.6657605171203613, 'learning_rate': 9.627309047276974e-05, 'epoch': 1.27}\n",
- "{'loss': 1.6975, 'grad_norm': 3.4882619380950928, 'learning_rate': 9.607381059352038e-05, 'epoch': 1.29}\n",
- "{'loss': 1.6179, 'grad_norm': 2.73240327835083, 'learning_rate': 9.586955808137958e-05, 'epoch': 1.3}\n",
- "{'loss': 1.6236, 'grad_norm': 2.60489559173584, 'learning_rate': 9.566035498081784e-05, 'epoch': 1.32}\n",
- "{'loss': 1.5901, 'grad_norm': 3.45670223236084, 'learning_rate': 9.544622387061055e-05, 'epoch': 1.34}\n",
- "{'loss': 1.3816, 'grad_norm': 3.3906328678131104, 'learning_rate': 9.522718786140097e-05, 'epoch': 1.36}\n",
- "{'loss': 1.6149, 'grad_norm': 3.6723110675811768, 'learning_rate': 9.500327059320606e-05, 'epoch': 1.37}\n",
- "{'loss': 1.4588, 'grad_norm': 4.5224103927612305, 'learning_rate': 9.477449623286505e-05, 'epoch': 1.39}\n",
- "{'loss': 1.3431, 'grad_norm': 2.5576796531677246, 'learning_rate': 9.454088947143116e-05, 'epoch': 1.41}\n",
- "{'loss': 1.6278, 'grad_norm': 3.344188690185547, 'learning_rate': 9.430247552150673e-05, 'epoch': 1.43}\n",
- "{'loss': 1.5137, 'grad_norm': 3.4474005699157715, 'learning_rate': 9.405928011452211e-05, 'epoch': 1.45}\n",
- "{'loss': 1.4911, 'grad_norm': 2.6104114055633545, 'learning_rate': 9.381132949795861e-05, 'epoch': 1.46}\n",
- "{'loss': 1.6567, 'grad_norm': 3.090139150619507, 'learning_rate': 9.35586504325155e-05, 'epoch': 1.48}\n",
- "{'loss': 1.5008, 'grad_norm': 3.6463866233825684, 'learning_rate': 9.330127018922194e-05, 'epoch': 1.5}\n",
- "{'loss': 1.4248, 'grad_norm': 2.3963379859924316, 'learning_rate': 9.303921654649362e-05, 'epoch': 1.52}\n",
- "{'loss': 1.6043, 'grad_norm': 3.4818763732910156, 'learning_rate': 9.277251778713474e-05, 'epoch': 1.54}\n",
- "{'loss': 1.5517, 'grad_norm': 3.180640697479248, 'learning_rate': 9.250120269528546e-05, 'epoch': 1.55}\n",
- "{'loss': 1.4711, 'grad_norm': 2.7267000675201416, 'learning_rate': 9.22253005533154e-05, 'epoch': 1.57}\n",
- "{'loss': 1.5511, 'grad_norm': 3.386282444000244, 'learning_rate': 9.194484113866313e-05, 'epoch': 1.59}\n",
- "{'loss': 1.6975, 'grad_norm': 2.707632064819336, 'learning_rate': 9.165985472062246e-05, 'epoch': 1.61}\n",
- "{'loss': 1.4396, 'grad_norm': 2.970285177230835, 'learning_rate': 9.137037205707552e-05, 'epoch': 1.62}\n",
- "{'loss': 1.5347, 'grad_norm': 2.7082931995391846, 'learning_rate': 9.107642439117321e-05, 'epoch': 1.64}\n",
- "{'loss': 1.5446, 'grad_norm': 2.947016716003418, 'learning_rate': 9.077804344796302e-05, 'epoch': 1.66}\n",
- "{'loss': 1.5401, 'grad_norm': 2.4926042556762695, 'learning_rate': 9.04752614309652e-05, 'epoch': 1.68}\n",
- "{'loss': 1.479, 'grad_norm': 3.50626802444458, 'learning_rate': 9.01681110186971e-05, 'epoch': 1.7}\n",
- "{'loss': 1.5107, 'grad_norm': 4.556169509887695, 'learning_rate': 8.985662536114613e-05, 'epoch': 1.71}\n",
- "{'loss': 1.473, 'grad_norm': 2.4575538635253906, 'learning_rate': 8.954083807619208e-05, 'epoch': 1.73}\n",
- "{'loss': 1.6125, 'grad_norm': 3.063415765762329, 'learning_rate': 8.922078324597879e-05, 'epoch': 1.75}\n",
- "{'loss': 1.5893, 'grad_norm': 2.45483660697937, 'learning_rate': 8.889649541323574e-05, 'epoch': 1.77}\n",
- "{'loss': 1.4993, 'grad_norm': 3.031142473220825, 'learning_rate': 8.856800957755e-05, 'epoch': 1.78}\n",
- "{'loss': 1.5025, 'grad_norm': 2.9005496501922607, 'learning_rate': 8.823536119158864e-05, 'epoch': 1.8}\n",
- "{'loss': 1.4725, 'grad_norm': 2.9155054092407227, 'learning_rate': 8.789858615727265e-05, 'epoch': 1.82}\n",
- "{'loss': 1.4313, 'grad_norm': 2.5998966693878174, 'learning_rate': 8.755772082190194e-05, 'epoch': 1.84}\n",
- "{'loss': 1.5647, 'grad_norm': 2.5580039024353027, 'learning_rate': 8.721280197423258e-05, 'epoch': 1.86}\n",
- "{'loss': 1.4349, 'grad_norm': 3.395029067993164, 'learning_rate': 8.68638668405062e-05, 'epoch': 1.87}\n",
- "{'loss': 1.5214, 'grad_norm': 2.8961341381073, 'learning_rate': 8.651095308043232e-05, 'epoch': 1.89}\n",
- "{'loss': 1.6206, 'grad_norm': 3.4450645446777344, 'learning_rate': 8.61540987831238e-05, 'epoch': 1.91}\n",
- "{'loss': 1.6429, 'grad_norm': 3.4198362827301025, 'learning_rate': 8.579334246298593e-05, 'epoch': 1.93}\n",
- "{'loss': 1.5473, 'grad_norm': 2.9955196380615234, 'learning_rate': 8.542872305555978e-05, 'epoch': 1.95}\n",
- "{'loss': 1.4405, 'grad_norm': 2.7997260093688965, 'learning_rate': 8.50602799133199e-05, 'epoch': 1.96}\n",
- "{'loss': 1.5382, 'grad_norm': 2.4689786434173584, 'learning_rate': 8.468805280142709e-05, 'epoch': 1.98}\n",
- "{'loss': 1.5378, 'grad_norm': 3.09759783744812, 'learning_rate': 8.43120818934367e-05, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโโ | 1120/3360 [32:07<1:05:51, 1.76s/it][INFO|trainer.py:3788] 2024-07-04 12:58:56,606 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 12:58:56,606 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 12:58:56,606 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 25.84it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:01, 21.96it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 21.04it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 19.14it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 19.13it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:00<00:01, 18.60it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 18.46it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:01<00:01, 18.88it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:01<00:01, 19.01it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 17.92it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:01, 18.62it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 18.49it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 18.80it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 18.48it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 18.48it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ๏ฟฝ๏ฟฝ๏ฟฝโโโโ | 37/46 [00:01<00:00, 18.86it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:02<00:00, 18.91it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:02<00:00, 18.85it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:02<00:00, 19.05it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 1.6970319747924805, 'eval_runtime': 2.4642, 'eval_samples_per_second': 18.668, 'eval_steps_per_second': 18.668, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโโ | 1120/3360 [32:10<1:05:51, 1.76s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 19.46it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 12:58:59,073 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-1120\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 12:58:59,895 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 12:58:59,896 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 12:58:59,945 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1120/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 12:58:59,945 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1120/special_tokens_map.json\n",
- "{'loss': 1.2363, 'grad_norm': 3.1393024921417236, 'learning_rate': 8.393240776696274e-05, 'epoch': 2.02}\n",
- "{'loss': 1.1161, 'grad_norm': 2.708930253982544, 'learning_rate': 8.354907139929851e-05, 'epoch': 2.03}\n",
- "{'loss': 1.1975, 'grad_norm': 4.3620429039001465, 'learning_rate': 8.316211416299397e-05, 'epoch': 2.05}\n",
- "{'loss': 1.1225, 'grad_norm': 3.3463101387023926, 'learning_rate': 8.27715778213905e-05, 'epoch': 2.07}\n",
- "{'loss': 1.0548, 'grad_norm': 2.8970718383789062, 'learning_rate': 8.237750452411353e-05, 'epoch': 2.09}\n",
- "{'loss': 1.1526, 'grad_norm': 2.99774432182312, 'learning_rate': 8.197993680252334e-05, 'epoch': 2.11}\n",
- "{'loss': 1.3093, 'grad_norm': 4.249015808105469, 'learning_rate': 8.157891756512488e-05, 'epoch': 2.12}\n",
- "{'loss': 1.1306, 'grad_norm': 3.889763593673706, 'learning_rate': 8.117449009293668e-05, 'epoch': 2.14}\n",
- "{'loss': 1.1286, 'grad_norm': 4.101832866668701, 'learning_rate': 8.076669803481965e-05, 'epoch': 2.16}\n",
- "{'loss': 1.1271, 'grad_norm': 4.3527703285217285, 'learning_rate': 8.035558540276618e-05, 'epoch': 2.18}\n",
- "{'loss': 1.1593, 'grad_norm': 3.5413858890533447, 'learning_rate': 7.994119656715002e-05, 'epoch': 2.2}\n",
- "{'loss': 1.1824, 'grad_norm': 5.094357490539551, 'learning_rate': 7.952357625193749e-05, 'epoch': 2.21}\n",
- "{'loss': 1.2347, 'grad_norm': 5.2239089012146, 'learning_rate': 7.91027695298606e-05, 'epoch': 2.23}\n",
- "{'loss': 1.2285, 'grad_norm': 5.532718658447266, 'learning_rate': 7.86788218175523e-05, 'epoch': 2.25}\n",
- "{'loss': 1.1147, 'grad_norm': 3.8143270015716553, 'learning_rate': 7.8251778870645e-05, 'epoch': 2.27}\n",
- "{'loss': 1.1478, 'grad_norm': 4.406189441680908, 'learning_rate': 7.782168677883206e-05, 'epoch': 2.28}\n",
- "{'loss': 1.1846, 'grad_norm': 3.269481658935547, 'learning_rate': 7.738859196089358e-05, 'epoch': 2.3}\n",
- "{'loss': 1.2015, 'grad_norm': 4.366032123565674, 'learning_rate': 7.695254115968648e-05, 'epoch': 2.32}\n",
- "{'loss': 1.3038, 'grad_norm': 3.7871077060699463, 'learning_rate': 7.651358143709972e-05, 'epoch': 2.34}\n",
- "{'loss': 1.2532, 'grad_norm': 3.805539846420288, 'learning_rate': 7.60717601689749e-05, 'epoch': 2.36}\n",
- "{'loss': 1.2044, 'grad_norm': 4.302929401397705, 'learning_rate': 7.562712503999327e-05, 'epoch': 2.37}\n",
- "{'loss': 1.3852, 'grad_norm': 4.319093227386475, 'learning_rate': 7.517972403852905e-05, 'epoch': 2.39}\n",
- "{'loss': 1.2647, 'grad_norm': 3.8114326000213623, 'learning_rate': 7.472960545147038e-05, 'epoch': 2.41}\n",
- "{'loss': 1.1138, 'grad_norm': 4.816274166107178, 'learning_rate': 7.427681785900761e-05, 'epoch': 2.43}\n",
- "{'loss': 1.1797, 'grad_norm': 3.7659311294555664, 'learning_rate': 7.382141012939034e-05, 'epoch': 2.45}\n",
- "{'loss': 1.1566, 'grad_norm': 3.777496337890625, 'learning_rate': 7.33634314136531e-05, 'epoch': 2.46}\n",
- "{'loss': 1.2235, 'grad_norm': 3.779813051223755, 'learning_rate': 7.290293114031061e-05, 'epoch': 2.48}\n",
- "{'loss': 1.3044, 'grad_norm': 4.243238925933838, 'learning_rate': 7.243995901002312e-05, 'epoch': 2.5}\n",
- "{'loss': 1.1993, 'grad_norm': 3.7302756309509277, 'learning_rate': 7.197456499023225e-05, 'epoch': 2.52}\n",
- "{'loss': 1.1955, 'grad_norm': 3.837207555770874, 'learning_rate': 7.150679930976825e-05, 'epoch': 2.53}\n",
- "{'loss': 1.2282, 'grad_norm': 4.182308673858643, 'learning_rate': 7.103671245342887e-05, 'epoch': 2.55}\n",
- "{'loss': 1.1068, 'grad_norm': 4.697420120239258, 'learning_rate': 7.056435515653059e-05, 'epoch': 2.57}\n",
- "{'loss': 1.3001, 'grad_norm': 5.241019248962402, 'learning_rate': 7.008977839943299e-05, 'epoch': 2.59}\n",
- "{'loss': 1.1734, 'grad_norm': 5.618649959564209, 'learning_rate': 6.961303340203653e-05, 'epoch': 2.61}\n",
- "{'loss': 1.2205, 'grad_norm': 3.501143455505371, 'learning_rate': 6.91341716182545e-05, 'epoch': 2.62}\n",
- "{'loss': 1.2196, 'grad_norm': 2.823162317276001, 'learning_rate': 6.86532447304597e-05, 'epoch': 2.64}\n",
- "{'loss': 1.1884, 'grad_norm': 3.8134286403656006, 'learning_rate': 6.817030464390656e-05, 'epoch': 2.66}\n",
- "{'loss': 1.296, 'grad_norm': 3.9806973934173584, 'learning_rate': 6.768540348112907e-05, 'epoch': 2.68}\n",
- "{'loss': 1.0861, 'grad_norm': 5.336892604827881, 'learning_rate': 6.719859357631535e-05, 'epoch': 2.7}\n",
- "{'loss': 1.1123, 'grad_norm': 5.413362503051758, 'learning_rate': 6.670992746965938e-05, 'epoch': 2.71}\n",
- "{'loss': 1.3405, 'grad_norm': 3.942927122116089, 'learning_rate': 6.621945790169036e-05, 'epoch': 2.73}\n",
- "{'loss': 1.2739, 'grad_norm': 3.9731507301330566, 'learning_rate': 6.572723780758069e-05, 'epoch': 2.75}\n",
- "{'loss': 1.2215, 'grad_norm': 3.9058139324188232, 'learning_rate': 6.523332031143272e-05, 'epoch': 2.77}\n",
- "{'loss': 1.231, 'grad_norm': 3.7157390117645264, 'learning_rate': 6.473775872054521e-05, 'epoch': 2.78}\n",
- "{'loss': 1.0667, 'grad_norm': 3.2383055686950684, 'learning_rate': 6.424060651966007e-05, 'epoch': 2.8}\n",
- "{'loss': 1.1742, 'grad_norm': 3.6972646713256836, 'learning_rate': 6.374191736518974e-05, 'epoch': 2.82}\n",
- "{'loss': 1.2108, 'grad_norm': 3.783498764038086, 'learning_rate': 6.324174507942637e-05, 'epoch': 2.84}\n",
- "{'loss': 1.1861, 'grad_norm': 4.8546037673950195, 'learning_rate': 6.274014364473274e-05, 'epoch': 2.86}\n",
- "{'loss': 1.364, 'grad_norm': 3.1121954917907715, 'learning_rate': 6.22371671977162e-05, 'epoch': 2.87}\n",
- "{'loss': 1.2202, 'grad_norm': 4.141942024230957, 'learning_rate': 6.173287002338577e-05, 'epoch': 2.89}\n",
- "{'loss': 1.2125, 'grad_norm': 4.195278167724609, 'learning_rate': 6.122730654929334e-05, 'epoch': 2.91}\n",
- "{'loss': 1.1392, 'grad_norm': 3.6065282821655273, 'learning_rate': 6.072053133965938e-05, 'epoch': 2.93}\n",
- "{'loss': 1.3093, 'grad_norm': 3.8997342586517334, 'learning_rate': 6.021259908948402e-05, 'epoch': 2.95}\n",
- "{'loss': 1.258, 'grad_norm': 4.212363243103027, 'learning_rate': 5.970356461864391e-05, 'epoch': 2.96}\n",
- "{'loss': 1.1774, 'grad_norm': 4.735218524932861, 'learning_rate': 5.919348286597569e-05, 'epoch': 2.98}\n",
- "{'loss': 1.2808, 'grad_norm': 3.88008713722229, 'learning_rate': 5.868240888334653e-05, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโ | 1680/3360 [48:42<49:15, 1.76s/it][INFO|trainer.py:3788] 2024-07-04 13:15:31,424 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 13:15:31,425 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 13:15:31,425 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 29.32it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:01, 22.95it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 21.48it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 20.77it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 20.79it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 20.58it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:00<00:01, 20.58it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 20.58it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:00, 20.54it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 19.77it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 19.59it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 19.42it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 18.92it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 19.10it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:01<00:00, 18.95it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 17.92it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 18.37it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 1.8097562789916992, 'eval_runtime': 2.3741, 'eval_samples_per_second': 19.376, 'eval_steps_per_second': 19.376, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโ | 1680/3360 [48:45<49:15, 1.76s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 18.65it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 13:15:33,801 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-1680\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 13:15:34,788 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 13:15:34,789 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 13:15:34,839 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1680/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 13:15:34,839 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1680/special_tokens_map.json\n",
- "{'loss': 1.1006, 'grad_norm': 3.581298589706421, 'learning_rate': 5.8170397829712485e-05, 'epoch': 3.02}\n",
- "{'loss': 0.7853, 'grad_norm': 4.149472713470459, 'learning_rate': 5.765750496516547e-05, 'epoch': 3.03}\n",
- "{'loss': 0.9606, 'grad_norm': 4.768033027648926, 'learning_rate': 5.714378564496901e-05, 'epoch': 3.05}\n",
- "{'loss': 0.8799, 'grad_norm': 3.7473530769348145, 'learning_rate': 5.6629295313583974e-05, 'epoch': 3.07}\n",
- "{'loss': 0.8164, 'grad_norm': 3.66397762298584, 'learning_rate': 5.611408949868457e-05, 'epoch': 3.09}\n",
- "{'loss': 0.8902, 'grad_norm': 5.061825275421143, 'learning_rate': 5.559822380516539e-05, 'epoch': 3.11}\n",
- "{'loss': 0.8534, 'grad_norm': 4.06561803817749, 'learning_rate': 5.5081753909140096e-05, 'epoch': 3.12}\n",
- "{'loss': 0.9668, 'grad_norm': 4.875536918640137, 'learning_rate': 5.456473555193242e-05, 'epoch': 3.14}\n",
- "{'loss': 0.8607, 'grad_norm': 6.051300048828125, 'learning_rate': 5.404722453406017e-05, 'epoch': 3.16}\n",
- "{'loss': 0.9096, 'grad_norm': 5.530092716217041, 'learning_rate': 5.3529276709212816e-05, 'epoch': 3.18}\n",
- "{'loss': 0.9201, 'grad_norm': 6.524964809417725, 'learning_rate': 5.30109479782233e-05, 'epoch': 3.2}\n",
- "{'loss': 0.856, 'grad_norm': 4.842297554016113, 'learning_rate': 5.249229428303486e-05, 'epoch': 3.21}\n",
- "{'loss': 1.0534, 'grad_norm': 3.963986396789551, 'learning_rate': 5.197337160066331e-05, 'epoch': 3.23}\n",
- "{'loss': 0.8642, 'grad_norm': 4.481607437133789, 'learning_rate': 5.145423593715557e-05, 'epoch': 3.25}\n",
- "{'loss': 0.8856, 'grad_norm': 3.9990179538726807, 'learning_rate': 5.0934943321545115e-05, 'epoch': 3.27}\n",
- "{'loss': 0.7925, 'grad_norm': 4.209486484527588, 'learning_rate': 5.041554979980486e-05, 'epoch': 3.28}\n",
- "{'loss': 0.9874, 'grad_norm': 4.624832630157471, 'learning_rate': 4.9896111428798254e-05, 'epoch': 3.3}\n",
- "{'loss': 0.9581, 'grad_norm': 5.0234785079956055, 'learning_rate': 4.9376684270229254e-05, 'epoch': 3.32}\n",
- "{'loss': 0.9273, 'grad_norm': 4.156904220581055, 'learning_rate': 4.8857324384591653e-05, 'epoch': 3.34}\n",
- "{'loss': 0.8929, 'grad_norm': 4.292726516723633, 'learning_rate': 4.8338087825118675e-05, 'epoch': 3.36}\n",
- "{'loss': 0.9584, 'grad_norm': 5.206954002380371, 'learning_rate': 4.781903063173321e-05, 'epoch': 3.37}\n",
- "{'loss': 0.8548, 'grad_norm': 4.075423717498779, 'learning_rate': 4.730020882499964e-05, 'epoch': 3.39}\n",
- "{'loss': 0.9083, 'grad_norm': 5.6302008628845215, 'learning_rate': 4.678167840007767e-05, 'epoch': 3.41}\n",
- "{'loss': 0.9967, 'grad_norm': 4.765602111816406, 'learning_rate': 4.626349532067879e-05, 'epoch': 3.43}\n",
- "{'loss': 0.9439, 'grad_norm': 4.012918949127197, 'learning_rate': 4.574571551302647e-05, 'epoch': 3.44}\n",
- "{'loss': 0.9758, 'grad_norm': 3.995499849319458, 'learning_rate': 4.522839485981994e-05, 'epoch': 3.46}\n",
- "{'loss': 0.9056, 'grad_norm': 4.168616771697998, 'learning_rate': 4.471158919420312e-05, 'epoch': 3.48}\n",
- "{'loss': 0.8866, 'grad_norm': 5.359450817108154, 'learning_rate': 4.4195354293738484e-05, 'epoch': 3.5}\n",
- "{'loss': 0.8474, 'grad_norm': 5.709634304046631, 'learning_rate': 4.367974587438733e-05, 'epoch': 3.52}\n",
- "{'loss': 0.9325, 'grad_norm': 6.928687572479248, 'learning_rate': 4.316481958449634e-05, 'epoch': 3.53}\n",
- "{'loss': 0.8947, 'grad_norm': 9.245586395263672, 'learning_rate': 4.2650630998791615e-05, 'epoch': 3.55}\n",
- "{'loss': 1.0068, 'grad_norm': 7.456272602081299, 'learning_rate': 4.213723561238074e-05, 'epoch': 3.57}\n",
- "{'loss': 0.8202, 'grad_norm': 3.894721746444702, 'learning_rate': 4.162468883476319e-05, 'epoch': 3.59}\n",
- "{'loss': 0.8858, 'grad_norm': 4.249356269836426, 'learning_rate': 4.111304598385018e-05, 'epoch': 3.61}\n",
- "{'loss': 0.9275, 'grad_norm': 6.780489921569824, 'learning_rate': 4.060236227999441e-05, 'epoch': 3.62}\n",
- "{'loss': 0.8648, 'grad_norm': 5.042501449584961, 'learning_rate': 4.0092692840030134e-05, 'epoch': 3.64}\n",
- "{'loss': 0.8905, 'grad_norm': 4.697298526763916, 'learning_rate': 3.9584092671324606e-05, 'epoch': 3.66}\n",
- "{'loss': 0.9487, 'grad_norm': 8.913374900817871, 'learning_rate': 3.907661666584131e-05, 'epoch': 3.68}\n",
- "{'loss': 0.9711, 'grad_norm': 6.1330885887146, 'learning_rate': 3.857031959421553e-05, 'epoch': 3.69}\n",
- "{'loss': 0.9569, 'grad_norm': 4.3145599365234375, 'learning_rate': 3.806525609984312e-05, 'epoch': 3.71}\n",
- "{'loss': 0.8455, 'grad_norm': 5.199124813079834, 'learning_rate': 3.7561480692983006e-05, 'epoch': 3.73}\n",
- "{'loss': 0.9282, 'grad_norm': 4.841589450836182, 'learning_rate': 3.705904774487396e-05, 'epoch': 3.75}\n",
- "{'loss': 0.9613, 'grad_norm': 5.256564140319824, 'learning_rate': 3.655801148186655e-05, 'epoch': 3.77}\n",
- "{'loss': 0.832, 'grad_norm': 4.737905025482178, 'learning_rate': 3.6058425979570485e-05, 'epoch': 3.78}\n",
- "{'loss': 0.9194, 'grad_norm': 4.6155524253845215, 'learning_rate': 3.556034515701852e-05, 'epoch': 3.8}\n",
- "{'loss': 0.9149, 'grad_norm': 5.484898090362549, 'learning_rate': 3.506382277084696e-05, 'epoch': 3.82}\n",
- "{'loss': 0.9029, 'grad_norm': 4.691559314727783, 'learning_rate': 3.4568912409493945e-05, 'epoch': 3.84}\n",
- "{'loss': 0.7487, 'grad_norm': 4.6990132331848145, 'learning_rate': 3.4075667487415785e-05, 'epoch': 3.86}\n",
- "{'loss': 0.8389, 'grad_norm': 3.9886608123779297, 'learning_rate': 3.358414123932195e-05, 'epoch': 3.87}\n",
- "{'loss': 0.9443, 'grad_norm': 3.797034978866577, 'learning_rate': 3.3094386714429724e-05, 'epoch': 3.89}\n",
- "{'loss': 0.9102, 'grad_norm': 9.836748123168945, 'learning_rate': 3.2606456770738636e-05, 'epoch': 3.91}\n",
- "{'loss': 0.8031, 'grad_norm': 6.517895221710205, 'learning_rate': 3.212040406932569e-05, 'epoch': 3.93}\n",
- "{'loss': 0.7276, 'grad_norm': 3.757455825805664, 'learning_rate': 3.163628106866172e-05, 'epoch': 3.94}\n",
- "{'loss': 1.0437, 'grad_norm': 5.128631591796875, 'learning_rate': 3.115414001894974e-05, 'epoch': 3.96}\n",
- "{'loss': 0.9261, 'grad_norm': 4.2124457359313965, 'learning_rate': 3.067403295648566e-05, 'epoch': 3.98}\n",
- "{'loss': 0.7864, 'grad_norm': 3.609720230102539, 'learning_rate': 3.019601169804216e-05, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [1:05:16<32:59, 1.77s/it][INFO|trainer.py:3788] 2024-07-04 13:32:05,670 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 13:32:05,670 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 13:32:05,670 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 24.52it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:01, 20.90it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 20.22it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 19.67it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 19.66it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:00<00:01, 18.52it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 18.59it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:01<00:01, 18.81it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:01<00:01, 19.01it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 18.89it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 19.08it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 19.48it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 19.48it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 19.42it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:01<00:00, 19.37it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 18.44it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 17.52it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 17.11it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 17.41it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 1.9859257936477661, 'eval_runtime': 2.5092, 'eval_samples_per_second': 18.332, 'eval_steps_per_second': 18.332, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [1:05:19<32:59, 1.77s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 17.74it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 13:32:08,182 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-2240\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 13:32:08,839 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 13:32:08,839 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 13:32:08,917 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2240/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 13:32:08,917 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2240/special_tokens_map.json\n",
- "{'loss': 0.7877, 'grad_norm': 3.5848188400268555, 'learning_rate': 2.9720127835276256e-05, 'epoch': 4.02}\n",
- "{'loss': 0.6372, 'grad_norm': 4.3321661949157715, 'learning_rate': 2.9246432729161055e-05, 'epoch': 4.03}\n",
- "{'loss': 0.7208, 'grad_norm': 5.079366207122803, 'learning_rate': 2.8774977504442647e-05, 'epoch': 4.05}\n",
- "{'loss': 0.6386, 'grad_norm': 4.389534950256348, 'learning_rate': 2.8305813044122097e-05, 'epoch': 4.07}\n",
- "{'loss': 0.6416, 'grad_norm': 3.866262197494507, 'learning_rate': 2.7838989983964065e-05, 'epoch': 4.09}\n",
- "{'loss': 0.7771, 'grad_norm': 4.4248528480529785, 'learning_rate': 2.737455870703155e-05, 'epoch': 4.11}\n",
- "{'loss': 0.5944, 'grad_norm': 5.1537370681762695, 'learning_rate': 2.6912569338248315e-05, 'epoch': 4.12}\n",
- "{'loss': 0.6846, 'grad_norm': 4.410754203796387, 'learning_rate': 2.645307173898901e-05, 'epoch': 4.14}\n",
- "{'loss': 0.7499, 'grad_norm': 3.9317386150360107, 'learning_rate': 2.5996115501697694e-05, 'epoch': 4.16}\n",
- "{'loss': 0.6378, 'grad_norm': 3.794434070587158, 'learning_rate': 2.5541749944535554e-05, 'epoch': 4.18}\n",
- "{'loss': 0.6153, 'grad_norm': 4.012321472167969, 'learning_rate': 2.5090024106057962e-05, 'epoch': 4.19}\n",
- "{'loss': 0.6922, 'grad_norm': 4.712143898010254, 'learning_rate': 2.464098673992205e-05, 'epoch': 4.21}\n",
- "{'loss': 0.6079, 'grad_norm': 5.002867698669434, 'learning_rate': 2.4194686309624663e-05, 'epoch': 4.23}\n",
- "{'loss': 0.7554, 'grad_norm': 6.034168720245361, 'learning_rate': 2.3751170983272e-05, 'epoch': 4.25}\n",
- "{'loss': 0.6634, 'grad_norm': 5.4491376876831055, 'learning_rate': 2.3310488628380757e-05, 'epoch': 4.27}\n",
- "{'loss': 0.6635, 'grad_norm': 6.335705280303955, 'learning_rate': 2.2872686806712035e-05, 'epoch': 4.28}\n",
- "{'loss': 0.6732, 'grad_norm': 4.363458633422852, 'learning_rate': 2.243781276913811e-05, 'epoch': 4.3}\n",
- "{'loss': 0.5751, 'grad_norm': 4.058308124542236, 'learning_rate': 2.200591345054267e-05, 'epoch': 4.32}\n",
- "{'loss': 0.7378, 'grad_norm': 5.493106365203857, 'learning_rate': 2.157703546475539e-05, 'epoch': 4.34}\n",
- "{'loss': 0.6231, 'grad_norm': 4.587257385253906, 'learning_rate': 2.115122509952085e-05, 'epoch': 4.36}\n",
- "{'loss': 0.6361, 'grad_norm': 4.070307731628418, 'learning_rate': 2.0728528311502976e-05, 'epoch': 4.37}\n",
- "{'loss': 0.7245, 'grad_norm': 5.507742404937744, 'learning_rate': 2.0308990721324927e-05, 'epoch': 4.39}\n",
- "{'loss': 0.6516, 'grad_norm': 4.98870849609375, 'learning_rate': 1.989265760864542e-05, 'epoch': 4.41}\n",
- "{'loss': 0.7311, 'grad_norm': 4.5378618240356445, 'learning_rate': 1.947957390727185e-05, 'epoch': 4.43}\n",
- "{'loss': 0.713, 'grad_norm': 6.595687389373779, 'learning_rate': 1.906978420031059e-05, 'epoch': 4.44}\n",
- "{'loss': 0.5884, 'grad_norm': 3.995753765106201, 'learning_rate': 1.8663332715355396e-05, 'epoch': 4.46}\n",
- "{'loss': 0.7598, 'grad_norm': 3.745181083679199, 'learning_rate': 1.8260263319713844e-05, 'epoch': 4.48}\n",
- "{'loss': 0.673, 'grad_norm': 5.82590389251709, 'learning_rate': 1.7860619515673033e-05, 'epoch': 4.5}\n",
- "{'loss': 0.6552, 'grad_norm': 5.151037216186523, 'learning_rate': 1.746444443580433e-05, 'epoch': 4.52}\n",
- "{'loss': 0.7091, 'grad_norm': 5.6730499267578125, 'learning_rate': 1.7071780838308288e-05, 'epoch': 4.53}\n",
- "{'loss': 0.7061, 'grad_norm': 5.2298502922058105, 'learning_rate': 1.6682671102399805e-05, 'epoch': 4.55}\n",
- "{'loss': 0.654, 'grad_norm': 5.273619651794434, 'learning_rate': 1.629715722373423e-05, 'epoch': 4.57}\n",
- "{'loss': 0.6822, 'grad_norm': 6.466513633728027, 'learning_rate': 1.5915280809874932e-05, 'epoch': 4.59}\n",
- "{'loss': 0.5785, 'grad_norm': 6.050833702087402, 'learning_rate': 1.553708307580265e-05, 'epoch': 4.61}\n",
- "{'loss': 0.6877, 'grad_norm': 6.502690315246582, 'learning_rate': 1.5162604839467265e-05, 'epoch': 4.62}\n",
- "{'loss': 0.6984, 'grad_norm': 4.606260299682617, 'learning_rate': 1.4791886517382413e-05, 'epoch': 4.64}\n",
- "{'loss': 0.6909, 'grad_norm': 4.80437707901001, 'learning_rate': 1.4424968120263504e-05, 'epoch': 4.66}\n",
- "{'loss': 0.6827, 'grad_norm': 5.20365047454834, 'learning_rate': 1.4061889248709343e-05, 'epoch': 4.68}\n",
- "{'loss': 0.6361, 'grad_norm': 6.29000997543335, 'learning_rate': 1.370268908892825e-05, 'epoch': 4.69}\n",
- "{'loss': 0.6747, 'grad_norm': 4.9368438720703125, 'learning_rate': 1.3347406408508695e-05, 'epoch': 4.71}\n",
- "{'loss': 0.6435, 'grad_norm': 5.528055667877197, 'learning_rate': 1.2996079552235263e-05, 'epoch': 4.73}\n",
- "{'loss': 0.6501, 'grad_norm': 4.367548942565918, 'learning_rate': 1.264874643795021e-05, 'epoch': 4.75}\n",
- "{'loss': 0.6376, 'grad_norm': 4.568158149719238, 'learning_rate': 1.230544455246101e-05, 'epoch': 4.77}\n",
- "{'loss': 0.7034, 'grad_norm': 5.3214287757873535, 'learning_rate': 1.1966210947494583e-05, 'epoch': 4.78}\n",
- "{'loss': 0.7303, 'grad_norm': 3.9356067180633545, 'learning_rate': 1.1631082235698316e-05, 'epoch': 4.8}\n",
- "{'loss': 0.6436, 'grad_norm': 5.198613166809082, 'learning_rate': 1.130009458668863e-05, 'epoch': 4.82}\n",
- "{'loss': 0.595, 'grad_norm': 4.129484176635742, 'learning_rate': 1.097328372314721e-05, 'epoch': 4.84}\n",
- "{'loss': 0.6302, 'grad_norm': 8.324830055236816, 'learning_rate': 1.0650684916965559e-05, 'epoch': 4.85}\n",
- "{'loss': 0.7296, 'grad_norm': 5.789163589477539, 'learning_rate': 1.0332332985438248e-05, 'epoch': 4.87}\n",
- "{'loss': 0.7077, 'grad_norm': 5.632966995239258, 'learning_rate': 1.0018262287505086e-05, 'epoch': 4.89}\n",
- "{'loss': 0.7339, 'grad_norm': 4.699968338012695, 'learning_rate': 9.708506720042932e-06, 'epoch': 4.91}\n",
- "{'loss': 0.6334, 'grad_norm': 3.969327926635742, 'learning_rate': 9.403099714207175e-06, 'epoch': 4.93}\n",
- "{'loss': 0.7298, 'grad_norm': 4.980201244354248, 'learning_rate': 9.102074231823727e-06, 'epoch': 4.94}\n",
- "{'loss': 0.7236, 'grad_norm': 6.4100565910339355, 'learning_rate': 8.805462761831418e-06, 'epoch': 4.96}\n",
- "{'loss': 0.7751, 'grad_norm': 5.446720600128174, 'learning_rate': 8.513297316775625e-06, 'epoch': 4.98}\n",
- "{'loss': 0.7407, 'grad_norm': 4.180345058441162, 'learning_rate': 8.225609429353187e-06, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [1:21:52<16:57, 1.82s/it][INFO|trainer.py:3788] 2024-07-04 13:48:40,919 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 13:48:40,919 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 13:48:40,919 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 28.49it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:01, 21.83it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 20.40it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 19.64it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 19.60it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 19.23it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:00<00:01, 18.37it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 18.58it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:01<00:01, 18.77it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโ๏ฟฝ๏ฟฝโโโโโโโโโโโ | 25/46 [00:01<00:01, 18.26it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:01, 18.05it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:00, 17.15it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 15.93it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 16.53it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 15.83it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:02<00:00, 16.58it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:02<00:00, 17.08it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:02<00:00, 17.52it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:02<00:00, 17.67it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.25398850440979, 'eval_runtime': 2.5926, 'eval_samples_per_second': 17.743, 'eval_steps_per_second': 17.743, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [1:21:54<16:57, 1.82s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 17.88it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 13:48:43,514 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-2800\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 13:48:44,254 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 13:48:44,254 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 13:48:44,307 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2800/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 13:48:44,307 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2800/special_tokens_map.json\n",
- "{'loss': 0.5816, 'grad_norm': 3.6714768409729004, 'learning_rate': 7.942430149009161e-06, 'epoch': 5.02}\n",
- "{'loss': 0.516, 'grad_norm': 3.9655864238739014, 'learning_rate': 7.663790038585793e-06, 'epoch': 5.03}\n",
- "{'loss': 0.5876, 'grad_norm': 4.2771453857421875, 'learning_rate': 7.389719171023857e-06, 'epoch': 5.05}\n",
- "{'loss': 0.5746, 'grad_norm': 5.545507431030273, 'learning_rate': 7.1202471261170245e-06, 'epoch': 5.07}\n",
- "{'loss': 0.5789, 'grad_norm': 4.685436248779297, 'learning_rate': 6.855402987319348e-06, 'epoch': 5.09}\n",
- "{'loss': 0.6558, 'grad_norm': 5.384147644042969, 'learning_rate': 6.595215338606397e-06, 'epoch': 5.1}\n",
- "{'loss': 0.4955, 'grad_norm': 5.300227642059326, 'learning_rate': 6.339712261390213e-06, 'epoch': 5.12}\n",
- "{'loss': 0.6284, 'grad_norm': 5.341045379638672, 'learning_rate': 6.088921331488568e-06, 'epoch': 5.14}\n",
- "{'loss': 0.5285, 'grad_norm': 4.509070873260498, 'learning_rate': 5.8428696161488215e-06, 'epoch': 5.16}\n",
- "{'loss': 0.5073, 'grad_norm': 4.6753339767456055, 'learning_rate': 5.601583671126531e-06, 'epoch': 5.18}\n",
- "{'loss': 0.5849, 'grad_norm': 3.836711883544922, 'learning_rate': 5.365089537819434e-06, 'epoch': 5.19}\n",
- "{'loss': 0.5714, 'grad_norm': 4.124776840209961, 'learning_rate': 5.133412740456806e-06, 'epoch': 5.21}\n",
- "{'loss': 0.5811, 'grad_norm': 4.734057903289795, 'learning_rate': 4.906578283344759e-06, 'epoch': 5.23}\n",
- "{'loss': 0.6351, 'grad_norm': 5.501781463623047, 'learning_rate': 4.684610648167503e-06, 'epoch': 5.25}\n",
- "{'loss': 0.5622, 'grad_norm': 4.912986755371094, 'learning_rate': 4.467533791345191e-06, 'epoch': 5.27}\n",
- "{'loss': 0.5305, 'grad_norm': 6.3503899574279785, 'learning_rate': 4.255371141448272e-06, 'epoch': 5.28}\n",
- "{'loss': 0.5406, 'grad_norm': 4.923576354980469, 'learning_rate': 4.048145596668967e-06, 'epoch': 5.3}\n",
- "{'loss': 0.5534, 'grad_norm': 4.20800256729126, 'learning_rate': 3.84587952234991e-06, 'epoch': 5.32}\n",
- "{'loss': 0.4831, 'grad_norm': 4.633558750152588, 'learning_rate': 3.6485947485702832e-06, 'epoch': 5.34}\n",
- "{'loss': 0.5587, 'grad_norm': 5.120583534240723, 'learning_rate': 3.4563125677897932e-06, 'epoch': 5.35}\n",
- "{'loss': 0.5696, 'grad_norm': 5.966647148132324, 'learning_rate': 3.269053732550581e-06, 'epoch': 5.37}\n",
- "{'loss': 0.5767, 'grad_norm': 5.047117233276367, 'learning_rate': 3.086838453237506e-06, 'epoch': 5.39}\n",
- "{'loss': 0.4262, 'grad_norm': 4.478403091430664, 'learning_rate': 2.9096863958968268e-06, 'epoch': 5.41}\n",
- "{'loss': 0.4798, 'grad_norm': 4.455025672912598, 'learning_rate': 2.737616680113758e-06, 'epoch': 5.43}\n",
- "{'loss': 0.4574, 'grad_norm': 3.7917206287384033, 'learning_rate': 2.570647876948895e-06, 'epoch': 5.44}\n",
- "{'loss': 0.5635, 'grad_norm': 7.098059177398682, 'learning_rate': 2.408798006933882e-06, 'epoch': 5.46}\n",
- "{'loss': 0.7231, 'grad_norm': 4.642895698547363, 'learning_rate': 2.252084538126542e-06, 'epoch': 5.48}\n",
- "{'loss': 0.5122, 'grad_norm': 5.233055591583252, 'learning_rate': 2.100524384225555e-06, 'epoch': 5.5}\n",
- "{'loss': 0.524, 'grad_norm': 4.6845173835754395, 'learning_rate': 1.9541339027450256e-06, 'epoch': 5.52}\n",
- "{'loss': 0.5816, 'grad_norm': 5.447011470794678, 'learning_rate': 1.8129288932490274e-06, 'epoch': 5.53}\n",
- "{'loss': 0.5329, 'grad_norm': 3.755023717880249, 'learning_rate': 1.6769245956464396e-06, 'epoch': 5.55}\n",
- "{'loss': 0.6767, 'grad_norm': 5.255481719970703, 'learning_rate': 1.5461356885461075e-06, 'epoch': 5.57}\n",
- "{'loss': 0.5529, 'grad_norm': 4.8336567878723145, 'learning_rate': 1.4205762876726092e-06, 'epoch': 5.59}\n",
- "{'loss': 0.6372, 'grad_norm': 5.332770824432373, 'learning_rate': 1.3002599443428243e-06, 'epoch': 5.6}\n",
- "{'loss': 0.634, 'grad_norm': 5.157808780670166, 'learning_rate': 1.1851996440033319e-06, 'epoch': 5.62}\n",
- "{'loss': 0.5033, 'grad_norm': 4.826900005340576, 'learning_rate': 1.0754078048289374e-06, 'epoch': 5.64}\n",
- "{'loss': 0.5681, 'grad_norm': 3.9047048091888428, 'learning_rate': 9.708962763824048e-07, 'epoch': 5.66}\n",
- "{'loss': 0.5432, 'grad_norm': 6.038053512573242, 'learning_rate': 8.716763383355864e-07, 'epoch': 5.68}\n",
- "{'loss': 0.6018, 'grad_norm': 5.233924388885498, 'learning_rate': 7.777586992519959e-07, 'epoch': 5.69}\n",
- "{'loss': 0.5367, 'grad_norm': 6.929383277893066, 'learning_rate': 6.891534954310885e-07, 'epoch': 5.71}\n",
- "{'loss': 0.6039, 'grad_norm': 4.509579181671143, 'learning_rate': 6.058702898142643e-07, 'epoch': 5.73}\n",
- "{'loss': 0.5292, 'grad_norm': 4.131773948669434, 'learning_rate': 5.279180709527765e-07, 'epoch': 5.75}\n",
- "{'loss': 0.6327, 'grad_norm': 4.368628025054932, 'learning_rate': 4.553052520375911e-07, 'epoch': 5.77}\n",
- "{'loss': 0.4859, 'grad_norm': 4.966446399688721, 'learning_rate': 3.8803966999139684e-07, 'epoch': 5.78}\n",
- "{'loss': 0.5397, 'grad_norm': 5.083605766296387, 'learning_rate': 3.261285846227868e-07, 'epoch': 5.8}\n",
- "{'loss': 0.4758, 'grad_norm': 4.257706165313721, 'learning_rate': 2.6957867784270787e-07, 'epoch': 5.82}\n",
- "{'loss': 0.492, 'grad_norm': 5.183888912200928, 'learning_rate': 2.1839605294330933e-07, 'epoch': 5.84}\n",
- "{'loss': 0.6466, 'grad_norm': 7.4429707527160645, 'learning_rate': 1.725862339392259e-07, 'epoch': 5.85}\n",
- "{'loss': 0.4461, 'grad_norm': 6.51588249206543, 'learning_rate': 1.3215416497138754e-07, 'epoch': 5.87}\n",
- "{'loss': 0.6614, 'grad_norm': 4.2303786277771, 'learning_rate': 9.710420977340762e-08, 'epoch': 5.89}\n",
- "{'loss': 0.4817, 'grad_norm': 6.3713908195495605, 'learning_rate': 6.744015120061509e-08, 'epoch': 5.91}\n",
- "{'loss': 0.6231, 'grad_norm': 10.188394546508789, 'learning_rate': 4.316519082179227e-08, 'epoch': 5.93}\n",
- "{'loss': 0.5204, 'grad_norm': 4.387541770935059, 'learning_rate': 2.4281948573617874e-08, 'epoch': 5.94}\n",
- "{'loss': 0.5938, 'grad_norm': 5.084804534912109, 'learning_rate': 1.0792462477909882e-08, 'epoch': 5.96}\n",
- "{'loss': 0.576, 'grad_norm': 4.955512523651123, 'learning_rate': 2.6981884216847884e-09, 'epoch': 5.98}\n",
- "{'loss': 0.5752, 'grad_norm': 4.1065449714660645, 'learning_rate': 0.0, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [1:38:23<00:00, 1.77s/it][INFO|trainer.py:3788] 2024-07-04 14:05:12,056 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 14:05:12,056 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 14:05:12,056 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 29.94it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:01, 23.42it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:01, 21.97it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 21.18it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 20.59it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:00<00:01, 20.24it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 19.57it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:01<00:01, 19.09it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:01<00:01, 19.28it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:00, 19.49it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 19.59it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 19.65it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:01<00:00, 19.62it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:01<00:00, 18.99it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 18.90it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 18.97it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 19.10it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.437338352203369, 'eval_runtime': 2.3741, 'eval_samples_per_second': 19.376, 'eval_steps_per_second': 19.376, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [1:38:25<00:00, 1.77s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 19.10it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 14:05:14,432 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-3360\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 14:05:15,110 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 14:05:15,111 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 14:05:15,155 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-3360/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 14:05:15,155 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-3360/special_tokens_map.json\n",
- "[INFO|trainer.py:2383] 2024-07-04 14:05:15,382 >> \n",
- "\n",
- "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
- "\n",
- "\n",
- "{'train_runtime': 5911.7152, 'train_samples_per_second': 4.549, 'train_steps_per_second': 0.568, 'train_loss': 1.1251599807114827, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [1:38:26<00:00, 1.76s/it]\n",
- "[INFO|trainer.py:3478] 2024-07-04 14:05:15,386 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 14:05:16,251 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 14:05:16,251 >> Model config Qwen2Config {\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 14:05:16,306 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 14:05:16,306 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/special_tokens_map.json\n",
- "***** train metrics *****\n",
- " epoch = 5.9973\n",
- " total_flos = 16732846GF\n",
- " train_loss = 1.1252\n",
- " train_runtime = 1:38:31.71\n",
- " train_samples_per_second = 4.549\n",
- " train_steps_per_second = 0.568\n",
- "Figure saved at: saves/qwen2-1.5b/lora/sft/training_loss.png\n",
- "Figure saved at: saves/qwen2-1.5b/lora/sft/training_eval_loss.png\n",
- "[INFO|trainer.py:3788] 2024-07-04 14:05:16,625 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 14:05:16,625 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 14:05:16,625 >> Batch size = 1\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 18.96it/s]\n",
- "***** eval metrics *****\n",
- " epoch = 5.9973\n",
- " eval_loss = 2.4373\n",
- " eval_runtime = 0:00:02.50\n",
- " eval_samples_per_second = 18.363\n",
- " eval_steps_per_second = 18.363\n",
- "[INFO|modelcard.py:449] 2024-07-04 14:05:19,133 >> Dropping the following result as it does not have all the necessary fields:\n",
- "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \\ 0.086 MB of 0.086 MB uploaded\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime โโโโ
โโโ
\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second โโ
โโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second โโ
โโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm โโโโโโโโโโโโ
โโโ
โ
โ
โโโโโโ
โโ
โโโ
โโ
โโโ
โโ
โโโโโ
\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate โโโ
โโโโโโโโโโโโโโโโโ
โ
โ
โโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss โโโโโโโโโโโ
โ
โโ
โโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss 2.43734\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime 2.5051\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second 18.363\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second 18.363\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: total_flos 1.7966756916707328e+16\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch 5.99732\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step 3360\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm 4.10654\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate 0.0\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 0.5752\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_loss 1.12516\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_runtime 5911.7152\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_samples_per_second 4.549\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_steps_per_second 0.568\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run \u001b[33mqwen2_1.5b_lora_sft\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/mpc5sxtf\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 6 W&B file(s), 0 media file(s), 1 artifact file(s) and 0 other file(s)\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20240704_122645-mpc5sxtf/logs\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The new W&B backend becomes opt-out in version 0.18.0; try it out with `wandb.require(\"core\")`! See https://wandb.me/wandb-core for more information.\n",
- "CPU times: user 1min 28s, sys: 26.5 s, total: 1min 54s\n",
- "Wall time: 1h 42min 32s\n"
- ]
- }
- ],
- "source": [
- "%%time\n",
- "\n",
- "!./scripts/tune-lf.sh config/qwen2_1.5b_lora_sft.yaml"
- ]
- },
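- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Note on the run above: eval loss rises monotonically (1.6970 at epoch 2 up to 2.4373 at epoch 6) while train loss falls to ~0.58, i.e. the LoRA adapter starts overfitting after roughly two epochs. The next cell is a small illustrative sketch, not part of the original run: it re-plots the per-epoch eval losses copied from the log, assuming `matplotlib` is available."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Illustrative sketch (values copied from the training log above).\n",
- "import matplotlib.pyplot as plt\n",
- "\n",
- "epochs = [2, 3, 4, 5, 6]\n",
- "eval_loss = [1.6970, 1.8098, 1.9859, 2.2540, 2.4373]\n",
- "\n",
- "plt.plot(epochs, eval_loss, marker=\"o\")\n",
- "plt.xlabel(\"epoch\")\n",
- "plt.ylabel(\"eval_loss\")\n",
- "plt.title(\"qwen2-1.5b LoRA SFT: eval loss per epoch\")\n",
- "plt.show()"
- ]
- },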
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Current Directory:\n",
- "/home/inflaton/code/projects/courses/llm-finetuning/llama-factory\n",
- "07/04/2024 14:50:13 - WARNING - llamafactory.hparams.parser - We recommend enable `upcast_layernorm` in quantized training.\n",
- "07/04/2024 14:50:13 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 14:50:14,466 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-7B-Instruct/snapshots/41c66b0be1c3081f13defc6bdf946c2ef240d6a6/vocab.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 14:50:14,466 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-7B-Instruct/snapshots/41c66b0be1c3081f13defc6bdf946c2ef240d6a6/merges.txt\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 14:50:14,466 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-7B-Instruct/snapshots/41c66b0be1c3081f13defc6bdf946c2ef240d6a6/tokenizer.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 14:50:14,466 >> loading file added_tokens.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 14:50:14,466 >> loading file special_tokens_map.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 14:50:14,467 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-7B-Instruct/snapshots/41c66b0be1c3081f13defc6bdf946c2ef240d6a6/tokenizer_config.json\n",
- "[WARNING|logging.py:313] 2024-07-04 14:50:14,635 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
- "07/04/2024 14:50:14 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n",
- "07/04/2024 14:50:14 - INFO - llamafactory.data.template - Add <|im_start|> to stop words.\n",
- "07/04/2024 14:50:14 - INFO - llamafactory.data.loader - Loading dataset alpaca_mac.json...\n",
- "Converting format of dataset (num_proc=16): 100%|โ| 4528/4528 [00:00<00:00, 1650\n",
- "Running tokenizer on dataset (num_proc=16): 100%|โ| 4528/4528 [00:01<00:00, 3163\n",
- "input_ids:\n",
- "[151644, 872, 198, 5501, 14683, 279, 2701, 8453, 1467, 1119, 6364, 323, 3410, 1172, 279, 24531, 2213, 11, 4302, 770, 624, 35987, 102895, 99164, 100324, 100717, 100095, 99509, 1773, 151645, 198, 151644, 77091, 198, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "inputs:\n",
- "<|im_start|>user\n",
- "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n",
- "ๅ
จไป็็ไปๆญๆใ<|im_end|>\n",
- "<|im_start|>assistant\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "label_ids:\n",
- "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "labels:\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 14:50:17,794 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-7B-Instruct/snapshots/41c66b0be1c3081f13defc6bdf946c2ef240d6a6/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 14:50:17,795 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-7B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "07/04/2024 14:50:17 - INFO - llamafactory.model.model_utils.quantization - Quantizing model to 4 bit with bitsandbytes.\n",
- "๐ฆฅ Unsloth: Will patch your computer to enable 2x faster free finetuning.\n",
- "config.json: 100%|โโโโโโโโโโโโโโโโโโโโโโโโโ| 1.19k/1.19k [00:00<00:00, 12.3MB/s]\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 14:50:19,202 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 14:50:19,203 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"unsloth/qwen2-7b-instruct-bnb-4bit\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "==((====))== Unsloth: Fast Qwen2 patching release 2024.6\n",
- " \\\\ /| GPU: NVIDIA GeForce RTX 4080 Laptop GPU. Max memory: 11.994 GB. Platform = Linux.\n",
- "O^O/ \\_/ \\ Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.\n",
- "\\ / Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.\n",
- " \"-____-\" Free Apache license: http://github.com/unslothai/unsloth\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 14:50:20,339 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 14:50:20,340 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"unsloth/qwen2-7b-instruct-bnb-4bit\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 14:50:20,992 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 14:50:20,993 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"unsloth/qwen2-7b-instruct-bnb-4bit\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "model.safetensors: 100%|โโโโโโโโโโโโโโโโโโโ| 5.55G/5.55G [31:00<00:00, 2.98MB/s]\n",
- "[INFO|modeling_utils.py:3556] 2024-07-04 15:21:22,487 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/model.safetensors\n",
- "[INFO|modeling_utils.py:1531] 2024-07-04 15:21:26,212 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n",
- "[INFO|configuration_utils.py:1000] 2024-07-04 15:21:26,219 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645\n",
- "}\n",
- "\n",
- "[INFO|modeling_utils.py:4364] 2024-07-04 15:26:00,017 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n",
- "\n",
- "[INFO|modeling_utils.py:4372] 2024-07-04 15:26:00,018 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at unsloth/qwen2-7b-instruct-bnb-4bit.\n",
- "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n",
- "generation_config.json: 100%|โโโโโโโโโโโโโโโโโโ| 243/243 [00:00<00:00, 3.75MB/s]\n",
- "[INFO|configuration_utils.py:955] 2024-07-04 15:26:01,541 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/generation_config.json\n",
- "[INFO|configuration_utils.py:1000] 2024-07-04 15:26:01,542 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"do_sample\": true,\n",
- " \"eos_token_id\": [\n",
- " 151645,\n",
- " 151643\n",
- " ],\n",
- " \"pad_token_id\": 151643,\n",
- " \"repetition_penalty\": 1.05,\n",
- " \"temperature\": 0.7,\n",
- " \"top_k\": 20,\n",
- " \"top_p\": 0.8\n",
- "}\n",
- "\n",
- "tokenizer_config.json: 100%|โโโโโโโโโโโโโโโ| 1.33k/1.33k [00:00<00:00, 19.0MB/s]\n",
- "vocab.json: 100%|โโโโโโโโโโโโโโโโโโโโโโโโโโ| 2.78M/2.78M [00:01<00:00, 1.75MB/s]\n",
- "merges.txt: 100%|โโโโโโโโโโโโโโโโโโโโโโโโโโ| 1.67M/1.67M [00:00<00:00, 1.89MB/s]\n",
- "added_tokens.json: 100%|โโโโโโโโโโโโโโโโโโโโโ| 80.0/80.0 [00:00<00:00, 1.29MB/s]\n",
- "special_tokens_map.json: 100%|โโโโโโโโโโโโโโโโโ| 367/367 [00:00<00:00, 6.11MB/s]\n",
- "tokenizer.json: 100%|โโโโโโโโโโโโโโโโโโโโโโ| 7.03M/7.03M [00:02<00:00, 3.09MB/s]\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:12,737 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/vocab.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:12,737 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/merges.txt\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:12,737 >> loading file added_tokens.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/added_tokens.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:12,737 >> loading file special_tokens_map.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/special_tokens_map.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:12,737 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:12,737 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/tokenizer.json\n",
- "[WARNING|logging.py:313] 2024-07-04 15:26:12,946 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:13,696 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/vocab.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:13,696 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/merges.txt\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:13,696 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/tokenizer.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:13,696 >> loading file added_tokens.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/added_tokens.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:13,696 >> loading file special_tokens_map.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/special_tokens_map.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 15:26:13,696 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/tokenizer_config.json\n",
- "[WARNING|logging.py:313] 2024-07-04 15:26:13,877 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
- "07/04/2024 15:26:14 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n",
- "07/04/2024 15:26:14 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n",
- "07/04/2024 15:26:14 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n",
- "07/04/2024 15:26:14 - INFO - llamafactory.model.model_utils.misc - Found linear modules: gate_proj,o_proj,v_proj,k_proj,up_proj,q_proj,down_proj\n",
- "[WARNING|logging.py:328] 2024-07-04 15:26:15,372 >> Unsloth 2024.6 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.\n",
- "07/04/2024 15:26:16 - INFO - llamafactory.model.loader - trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643\n",
- "[INFO|trainer.py:642] 2024-07-04 15:26:16,270 >> Using auto half precision backend\n",
- "07/04/2024 15:26:16 - INFO - llamafactory.train.trainer_utils - Using LoRA+ optimizer with loraplus lr ratio 16.00.\n",
- "[WARNING|:223] 2024-07-04 15:26:16,423 >> ==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1\n",
- " \\\\ /| Num examples = 4,482 | Num Epochs = 6\n",
- "O^O/ \\_/ \\ Batch size per device = 1 | Gradient Accumulation steps = 8\n",
- "\\ / Total batch size = 8 | Total steps = 3,360\n",
- " \"-____-\" Number of trainable parameters = 20,185,088\n",
- "[INFO|integration_utils.py:750] 2024-07-04 15:26:16,929 >> Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33minflaton-sg\u001b[0m (\u001b[33minflaton-ai\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.17.4\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m/home/inflaton/code/projects/courses/llm-finetuning/llama-factory/wandb/run-20240704_152618-o710838e\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mqwen2_7b_lora_sft\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/o710838e\u001b[0m\n",
- "{'loss': 2.1957, 'grad_norm': 2.977725028991699, 'learning_rate': 2.9761904761904763e-06, 'epoch': 0.02}\n",
- "{'loss': 1.9984, 'grad_norm': 1.17664635181427, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.04}\n",
- "{'loss': 1.7375, 'grad_norm': 0.7683635354042053, 'learning_rate': 8.92857142857143e-06, 'epoch': 0.05}\n",
- "{'loss': 1.7268, 'grad_norm': 1.5277972221374512, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.07}\n",
- "{'loss': 1.7873, 'grad_norm': 0.7151318788528442, 'learning_rate': 1.4880952380952381e-05, 'epoch': 0.09}\n",
- "{'loss': 1.6224, 'grad_norm': 0.7458081841468811, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.11}\n",
- "{'loss': 1.7345, 'grad_norm': 0.7242929339408875, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.12}\n",
- "{'loss': 1.57, 'grad_norm': 0.8281179666519165, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.14}\n",
- "{'loss': 1.6718, 'grad_norm': 1.0110186338424683, 'learning_rate': 2.6785714285714288e-05, 'epoch': 0.16}\n",
- "{'loss': 1.6219, 'grad_norm': 0.8258731961250305, 'learning_rate': 2.9761904761904762e-05, 'epoch': 0.18}\n",
- "{'loss': 1.6115, 'grad_norm': 0.9346244931221008, 'learning_rate': 3.273809523809524e-05, 'epoch': 0.2}\n",
- "{'loss': 1.6081, 'grad_norm': 1.0503712892532349, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.21}\n",
- "{'loss': 1.5874, 'grad_norm': 1.1157383918762207, 'learning_rate': 3.8690476190476195e-05, 'epoch': 0.23}\n",
- "{'loss': 1.5825, 'grad_norm': 1.212875485420227, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.25}\n",
- "{'loss': 1.512, 'grad_norm': 1.073009967803955, 'learning_rate': 4.464285714285715e-05, 'epoch': 0.27}\n",
- "{'loss': 1.5074, 'grad_norm': 0.8304378390312195, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.29}\n",
- "{'loss': 1.6019, 'grad_norm': 0.8581129908561707, 'learning_rate': 5.05952380952381e-05, 'epoch': 0.3}\n",
- "{'loss': 1.4319, 'grad_norm': 1.027076244354248, 'learning_rate': 5.3571428571428575e-05, 'epoch': 0.32}\n",
- "{'loss': 1.555, 'grad_norm': 1.3464545011520386, 'learning_rate': 5.6547619047619046e-05, 'epoch': 0.34}\n",
- "{'loss': 1.6637, 'grad_norm': 1.2714892625808716, 'learning_rate': 5.9523809523809524e-05, 'epoch': 0.36}\n",
- "{'loss': 1.6159, 'grad_norm': 1.0014649629592896, 'learning_rate': 6.25e-05, 'epoch': 0.37}\n",
- "{'loss': 1.5019, 'grad_norm': 1.4355653524398804, 'learning_rate': 6.547619047619048e-05, 'epoch': 0.39}\n",
- "{'loss': 1.5167, 'grad_norm': 1.2876572608947754, 'learning_rate': 6.845238095238096e-05, 'epoch': 0.41}\n",
- "{'loss': 1.6807, 'grad_norm': 1.4459688663482666, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.43}\n",
- "{'loss': 1.6053, 'grad_norm': 1.7381216287612915, 'learning_rate': 7.440476190476191e-05, 'epoch': 0.45}\n",
- "{'loss': 1.4993, 'grad_norm': 1.516874074935913, 'learning_rate': 7.738095238095239e-05, 'epoch': 0.46}\n",
- "{'loss': 1.58, 'grad_norm': 1.7755393981933594, 'learning_rate': 8.035714285714287e-05, 'epoch': 0.48}\n",
- "{'loss': 1.5699, 'grad_norm': 1.7302135229110718, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.5}\n",
- "{'loss': 1.419, 'grad_norm': 1.172330617904663, 'learning_rate': 8.630952380952382e-05, 'epoch': 0.52}\n",
- "{'loss': 1.5505, 'grad_norm': 1.676744818687439, 'learning_rate': 8.92857142857143e-05, 'epoch': 0.54}\n",
- "{'loss': 1.5749, 'grad_norm': 1.8019312620162964, 'learning_rate': 9.226190476190478e-05, 'epoch': 0.55}\n",
- "{'loss': 1.4185, 'grad_norm': 2.2339751720428467, 'learning_rate': 9.523809523809524e-05, 'epoch': 0.57}\n",
- "{'loss': 1.4871, 'grad_norm': 1.8845446109771729, 'learning_rate': 9.821428571428572e-05, 'epoch': 0.59}\n",
- "{'loss': 1.4547, 'grad_norm': 1.5382771492004395, 'learning_rate': 9.999956828659095e-05, 'epoch': 0.61}\n",
- "{'loss': 1.5409, 'grad_norm': 2.5924744606018066, 'learning_rate': 9.999471159635539e-05, 'epoch': 0.62}\n",
- "{'loss': 1.4544, 'grad_norm': 1.6850535869598389, 'learning_rate': 9.998445910004082e-05, 'epoch': 0.64}\n",
- "{'loss': 1.6716, 'grad_norm': 2.093435287475586, 'learning_rate': 9.996881190417393e-05, 'epoch': 0.66}\n",
- "{'loss': 1.5389, 'grad_norm': 1.9192240238189697, 'learning_rate': 9.994777169751806e-05, 'epoch': 0.68}\n",
- "{'loss': 1.3255, 'grad_norm': 1.820000410079956, 'learning_rate': 9.992134075089084e-05, 'epoch': 0.7}\n",
- "{'loss': 1.4784, 'grad_norm': 1.8777908086776733, 'learning_rate': 9.988952191691925e-05, 'epoch': 0.71}\n",
- "{'loss': 1.5354, 'grad_norm': 1.7081478834152222, 'learning_rate': 9.985231862973168e-05, 'epoch': 0.73}\n",
- "{'loss': 1.5822, 'grad_norm': 1.6461598873138428, 'learning_rate': 9.980973490458728e-05, 'epoch': 0.75}\n",
- "{'loss': 1.5233, 'grad_norm': 2.1327311992645264, 'learning_rate': 9.976177533744261e-05, 'epoch': 0.77}\n",
- "{'loss': 1.4739, 'grad_norm': 2.4746365547180176, 'learning_rate': 9.97084451044556e-05, 'epoch': 0.79}\n",
- "{'loss': 1.4276, 'grad_norm': 1.7821303606033325, 'learning_rate': 9.964974996142698e-05, 'epoch': 0.8}\n",
- "{'loss': 1.4803, 'grad_norm': 1.551522970199585, 'learning_rate': 9.958569624317893e-05, 'epoch': 0.82}\n",
- "{'loss': 1.5314, 'grad_norm': 2.6767489910125732, 'learning_rate': 9.951629086287151e-05, 'epoch': 0.84}\n",
- "{'loss': 1.4844, 'grad_norm': 1.7266111373901367, 'learning_rate': 9.944154131125642e-05, 'epoch': 0.86}\n",
- "{'loss': 1.5248, 'grad_norm': 1.7948070764541626, 'learning_rate': 9.936145565586871e-05, 'epoch': 0.87}\n",
- "{'loss': 1.6563, 'grad_norm': 1.6244261264801025, 'learning_rate': 9.927604254015585e-05, 'epoch': 0.89}\n",
- "{'loss': 1.5928, 'grad_norm': 1.7924832105636597, 'learning_rate': 9.918531118254507e-05, 'epoch': 0.91}\n",
- "{'loss': 1.4955, 'grad_norm': 2.337216377258301, 'learning_rate': 9.90892713754483e-05, 'epoch': 0.93}\n",
- "{'loss': 1.558, 'grad_norm': 2.165968179702759, 'learning_rate': 9.898793348420536e-05, 'epoch': 0.95}\n",
- "{'loss': 1.5148, 'grad_norm': 1.7740817070007324, 'learning_rate': 9.888130844596524e-05, 'epoch': 0.96}\n",
- "{'loss': 1.5339, 'grad_norm': 2.276500940322876, 'learning_rate': 9.876940776850569e-05, 'epoch': 0.98}\n",
- "{'loss': 1.4748, 'grad_norm': 1.852982521057129, 'learning_rate': 9.865224352899119e-05, 'epoch': 1.0}\n",
- " 17%|โโโโโโ | 560/3360 [1:04:27<5:48:51, 7.48s/it][INFO|trainer.py:3788] 2024-07-04 16:30:50,001 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 16:30:50,003 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 16:30:50,003 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 4%|โโ | 2/46 [00:00<00:04, 9.82it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:06, 6.46it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:08, 5.15it/s]\u001b[A\n",
- " 11%|โโโโโ | 5/46 [00:00<00:08, 5.06it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:01<00:07, 5.10it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:01<00:08, 4.43it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:01<00:09, 4.07it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:01<00:09, 3.75it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:02<00:10, 3.36it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:02<00:10, 3.20it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:03<00:10, 3.20it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:03<00:09, 3.38it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:03<00:09, 3.53it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:03<00:08, 3.54it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:04<00:09, 3.30it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:04<00:09, 3.03it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:05<00:10, 2.72it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:05<00:10, 2.64it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:05<00:10, 2.46it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:06<00:10, 2.36it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:06<00:09, 2.46it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:06<00:08, 2.76it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:07<00:06, 3.19it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:07<00:05, 3.75it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:07<00:04, 4.31it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:07<00:03, 4.93it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:07<00:03, 5.49it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:07<00:02, 5.92it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:08<00:02, 6.33it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโ๏ฟฝ๏ฟฝโโโโโโโโโโโโโโโโโ | 31/46 [00:08<00:02, 6.71it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:08<00:02, 6.80it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:08<00:01, 6.67it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:08<00:01, 6.77it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:08<00:01, 6.46it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:08<00:01, 5.78it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:10<00:03, 2.28it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:11<00:05, 1.57it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:11<00:03, 1.87it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:11<00:02, 2.15it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:13<00:04, 1.13it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:13<00:02, 1.39it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:14<00:01, 1.80it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:14<00:00, 2.28it/s]\u001b[A\n",
- " 98%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 45/46 [00:14<00:00, 2.82it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 1.5079401731491089, 'eval_runtime': 14.9104, 'eval_samples_per_second': 3.085, 'eval_steps_per_second': 3.085, 'epoch': 1.0}\n",
- " 17%|โโโโโโ | 560/3360 [1:04:42<5:48:51, 7.48s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:14<00:00, 3.42it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 16:31:04,915 >> Saving model checkpoint to saves/qwen2-7b/lora/sft/checkpoint-560\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 16:31:06,164 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 16:31:06,165 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-7B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 16:31:06,584 >> tokenizer config file saved in saves/qwen2-7b/lora/sft/checkpoint-560/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 16:31:06,585 >> Special tokens file saved in saves/qwen2-7b/lora/sft/checkpoint-560/special_tokens_map.json\n",
- "{'loss': 1.0693, 'grad_norm': 2.4884133338928223, 'learning_rate': 9.852982837266955e-05, 'epoch': 1.02}\n",
- "{'loss': 0.9156, 'grad_norm': 2.0713613033294678, 'learning_rate': 9.840217551150706e-05, 'epoch': 1.04}\n",
- "{'loss': 1.0533, 'grad_norm': 2.160870313644409, 'learning_rate': 9.826929872276255e-05, 'epoch': 1.05}\n",
- "{'loss': 0.9734, 'grad_norm': 2.251491069793701, 'learning_rate': 9.81312123475006e-05, 'epoch': 1.07}\n",
- "{'loss': 0.925, 'grad_norm': 2.160745859146118, 'learning_rate': 9.798793128904356e-05, 'epoch': 1.09}\n",
- "{'loss': 1.0312, 'grad_norm': 2.9455161094665527, 'learning_rate': 9.78394710113631e-05, 'epoch': 1.11}\n",
- "{'loss': 0.9867, 'grad_norm': 1.834627628326416, 'learning_rate': 9.768584753741134e-05, 'epoch': 1.12}\n",
- "{'loss': 0.9962, 'grad_norm': 2.891728401184082, 'learning_rate': 9.752707744739145e-05, 'epoch': 1.14}\n",
- "{'loss': 1.0046, 'grad_norm': 2.459664821624756, 'learning_rate': 9.736317787696816e-05, 'epoch': 1.16}\n",
- "{'loss': 0.872, 'grad_norm': 2.503146171569824, 'learning_rate': 9.719416651541839e-05, 'epoch': 1.18}\n",
- "{'loss': 0.9536, 'grad_norm': 1.9054204225540161, 'learning_rate': 9.702006160372209e-05, 'epoch': 1.2}\n",
- "{'loss': 0.9768, 'grad_norm': 2.08803129196167, 'learning_rate': 9.684088193259355e-05, 'epoch': 1.21}\n",
- "{'loss': 0.9448, 'grad_norm': 2.4227285385131836, 'learning_rate': 9.665664684045333e-05, 'epoch': 1.23}\n",
- "{'loss': 1.0078, 'grad_norm': 2.396881103515625, 'learning_rate': 9.646737621134112e-05, 'epoch': 1.25}\n",
- "{'loss': 0.9285, 'grad_norm': 4.0550384521484375, 'learning_rate': 9.627309047276974e-05, 'epoch': 1.27}\n",
- "{'loss': 1.0518, 'grad_norm': 3.4381208419799805, 'learning_rate': 9.607381059352038e-05, 'epoch': 1.29}\n",
- "{'loss': 1.0221, 'grad_norm': 2.341543674468994, 'learning_rate': 9.586955808137958e-05, 'epoch': 1.3}\n",
- "{'loss': 1.0084, 'grad_norm': 2.660717725753784, 'learning_rate': 9.566035498081784e-05, 'epoch': 1.32}\n",
- "{'loss': 1.0374, 'grad_norm': 2.4253923892974854, 'learning_rate': 9.544622387061055e-05, 'epoch': 1.34}\n",
- "{'loss': 0.8872, 'grad_norm': 3.2932205200195312, 'learning_rate': 9.522718786140097e-05, 'epoch': 1.36}\n",
- "{'loss': 1.0013, 'grad_norm': 3.3068909645080566, 'learning_rate': 9.500327059320606e-05, 'epoch': 1.37}\n",
- "{'loss': 0.9135, 'grad_norm': 3.9048690795898438, 'learning_rate': 9.477449623286505e-05, 'epoch': 1.39}\n",
- "{'loss': 0.8808, 'grad_norm': 2.9740893840789795, 'learning_rate': 9.454088947143116e-05, 'epoch': 1.41}\n",
- "{'loss': 1.0511, 'grad_norm': 3.2612483501434326, 'learning_rate': 9.430247552150673e-05, 'epoch': 1.43}\n",
- "{'loss': 0.9457, 'grad_norm': 2.8854198455810547, 'learning_rate': 9.405928011452211e-05, 'epoch': 1.45}\n",
- "{'loss': 0.9401, 'grad_norm': 2.1029069423675537, 'learning_rate': 9.381132949795861e-05, 'epoch': 1.46}\n",
- "{'loss': 1.0803, 'grad_norm': 3.1445486545562744, 'learning_rate': 9.35586504325155e-05, 'epoch': 1.48}\n",
- "{'loss': 0.9944, 'grad_norm': 3.0867714881896973, 'learning_rate': 9.330127018922194e-05, 'epoch': 1.5}\n",
- "{'loss': 0.8916, 'grad_norm': 2.589761257171631, 'learning_rate': 9.303921654649362e-05, 'epoch': 1.52}\n",
- "{'loss': 0.988, 'grad_norm': 2.9633171558380127, 'learning_rate': 9.277251778713474e-05, 'epoch': 1.54}\n",
- "{'loss': 0.9376, 'grad_norm': 3.082129716873169, 'learning_rate': 9.250120269528546e-05, 'epoch': 1.55}\n",
- "{'loss': 0.9333, 'grad_norm': 2.1602373123168945, 'learning_rate': 9.22253005533154e-05, 'epoch': 1.57}\n",
- "{'loss': 1.0027, 'grad_norm': 2.900174617767334, 'learning_rate': 9.194484113866313e-05, 'epoch': 1.59}\n",
- "{'loss': 1.1305, 'grad_norm': 3.4030845165252686, 'learning_rate': 9.165985472062246e-05, 'epoch': 1.61}\n",
- "{'loss': 0.8973, 'grad_norm': 2.5629944801330566, 'learning_rate': 9.137037205707552e-05, 'epoch': 1.62}\n",
- "{'loss': 0.9483, 'grad_norm': 3.2390940189361572, 'learning_rate': 9.107642439117321e-05, 'epoch': 1.64}\n",
- "{'loss': 0.9879, 'grad_norm': 2.5794193744659424, 'learning_rate': 9.077804344796302e-05, 'epoch': 1.66}\n",
- "{'loss': 0.9668, 'grad_norm': 2.389864444732666, 'learning_rate': 9.04752614309652e-05, 'epoch': 1.68}\n",
- "{'loss': 0.8852, 'grad_norm': 3.5650432109832764, 'learning_rate': 9.01681110186971e-05, 'epoch': 1.7}\n",
- "{'loss': 0.9984, 'grad_norm': 3.166510581970215, 'learning_rate': 8.985662536114613e-05, 'epoch': 1.71}\n",
- "{'loss': 0.9526, 'grad_norm': 2.176490306854248, 'learning_rate': 8.954083807619208e-05, 'epoch': 1.73}\n",
- "{'loss': 1.0377, 'grad_norm': 3.0157470703125, 'learning_rate': 8.922078324597879e-05, 'epoch': 1.75}\n",
- "{'loss': 1.1106, 'grad_norm': 2.781142234802246, 'learning_rate': 8.889649541323574e-05, 'epoch': 1.77}\n",
- "{'loss': 1.0373, 'grad_norm': 3.456441879272461, 'learning_rate': 8.856800957755e-05, 'epoch': 1.78}\n",
- "{'loss': 1.0307, 'grad_norm': 3.646578311920166, 'learning_rate': 8.823536119158864e-05, 'epoch': 1.8}\n",
- "{'loss': 0.9769, 'grad_norm': 3.4664463996887207, 'learning_rate': 8.789858615727265e-05, 'epoch': 1.82}\n",
- "{'loss': 0.9524, 'grad_norm': 2.52860951423645, 'learning_rate': 8.755772082190194e-05, 'epoch': 1.84}\n",
- "{'loss': 1.0686, 'grad_norm': 3.0946435928344727, 'learning_rate': 8.721280197423258e-05, 'epoch': 1.86}\n",
- "{'loss': 0.9359, 'grad_norm': 3.146989583969116, 'learning_rate': 8.68638668405062e-05, 'epoch': 1.87}\n",
- "{'loss': 1.0035, 'grad_norm': 3.2309892177581787, 'learning_rate': 8.651095308043232e-05, 'epoch': 1.89}\n",
- "{'loss': 1.0669, 'grad_norm': 3.8748905658721924, 'learning_rate': 8.61540987831238e-05, 'epoch': 1.91}\n",
- "{'loss': 1.0676, 'grad_norm': 3.329939603805542, 'learning_rate': 8.579334246298593e-05, 'epoch': 1.93}\n",
- "{'loss': 0.9976, 'grad_norm': 3.7491514682769775, 'learning_rate': 8.542872305555978e-05, 'epoch': 1.95}\n",
- "{'loss': 0.9471, 'grad_norm': 3.245119571685791, 'learning_rate': 8.50602799133199e-05, 'epoch': 1.96}\n",
- "{'loss': 0.9998, 'grad_norm': 2.7840590476989746, 'learning_rate': 8.468805280142709e-05, 'epoch': 1.98}\n",
- "{'loss': 1.0361, 'grad_norm': 3.2855234146118164, 'learning_rate': 8.43120818934367e-05, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโ | 1120/3360 [2:09:53<4:22:41, 7.04s/it][INFO|trainer.py:3788] 2024-07-04 17:36:15,576 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 17:36:15,578 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 17:36:15,580 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 4%|โโ | 2/46 [00:00<00:13, 3.20it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:14, 3.05it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:01<00:10, 3.85it/s]\u001b[A\n",
- " 11%|โโโโโ | 5/46 [00:01<00:08, 4.72it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:01<00:07, 5.37it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:01<00:06, 5.88it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:01<00:05, 6.41it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:01<00:05, 6.69it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:01<00:05, 7.00it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:02<00:05, 6.87it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:02<00:05, 6.23it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:02<00:05, 5.53it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:02<00:07, 4.35it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:03<00:14, 2.10it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:04<00:19, 1.57it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:05<00:15, 1.83it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:05<00:13, 2.15it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:05<00:10, 2.51it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:05<00:08, 2.91it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:06<00:07, 3.49it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:06<00:05, 4.10it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:06<00:04, 4.73it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:06<00:04, 5.36it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:06<00:03, 6.02it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:06<00:03, 6.66it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:06<00:02, 7.06it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:07<00:02, 8.06it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:07<00:01, 8.26it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:07<00:01, 8.38it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:07<00:01, 8.30it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:07<00:01, 7.96it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:07<00:01, 7.71it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:07<00:01, 7.32it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:08<00:01, 5.57it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:10<00:06, 1.36it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:10<00:05, 1.55it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:10<00:03, 1.97it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:10<00:02, 2.51it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:11<00:01, 3.11it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:11<00:01, 3.75it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:11<00:00, 4.32it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:11<00:00, 4.38it/s]\u001b[A\n",
- " 98%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 45/46 [00:11<00:00, 3.93it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 1.6183497905731201, 'eval_runtime': 14.0479, 'eval_samples_per_second': 3.275, 'eval_steps_per_second': 3.275, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโ | 1120/3360 [2:10:07<4:22:41, 7.04s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:13<00:00, 1.67it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 17:36:29,696 >> Saving model checkpoint to saves/qwen2-7b/lora/sft/checkpoint-1120\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 17:36:31,166 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 17:36:31,166 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-7B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 17:36:31,345 >> tokenizer config file saved in saves/qwen2-7b/lora/sft/checkpoint-1120/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 17:36:31,345 >> Special tokens file saved in saves/qwen2-7b/lora/sft/checkpoint-1120/special_tokens_map.json\n",
- "{'loss': 0.4204, 'grad_norm': 3.7729196548461914, 'learning_rate': 8.393240776696274e-05, 'epoch': 2.02}\n",
- "{'loss': 0.3656, 'grad_norm': 2.241997718811035, 'learning_rate': 8.354907139929851e-05, 'epoch': 2.03}\n",
- "{'loss': 0.3554, 'grad_norm': 3.0927772521972656, 'learning_rate': 8.316211416299397e-05, 'epoch': 2.05}\n",
- "{'loss': 0.3576, 'grad_norm': 4.350724697113037, 'learning_rate': 8.27715778213905e-05, 'epoch': 2.07}\n",
- "{'loss': 0.2409, 'grad_norm': 2.0694334506988525, 'learning_rate': 8.237750452411353e-05, 'epoch': 2.09}\n",
- "{'loss': 0.3769, 'grad_norm': 2.3954668045043945, 'learning_rate': 8.197993680252334e-05, 'epoch': 2.11}\n",
- "{'loss': 0.4252, 'grad_norm': 2.997573137283325, 'learning_rate': 8.157891756512488e-05, 'epoch': 2.12}\n",
- "{'loss': 0.3827, 'grad_norm': 3.1807985305786133, 'learning_rate': 8.117449009293668e-05, 'epoch': 2.14}\n",
- "{'loss': 0.3291, 'grad_norm': 4.341946125030518, 'learning_rate': 8.076669803481965e-05, 'epoch': 2.16}\n",
- "{'loss': 0.3049, 'grad_norm': 4.446887493133545, 'learning_rate': 8.035558540276618e-05, 'epoch': 2.18}\n",
- "{'loss': 0.3638, 'grad_norm': 2.7504091262817383, 'learning_rate': 7.994119656715002e-05, 'epoch': 2.2}\n",
- "{'loss': 0.4044, 'grad_norm': 2.769212484359741, 'learning_rate': 7.952357625193749e-05, 'epoch': 2.21}\n",
- "{'loss': 0.3703, 'grad_norm': 5.1489362716674805, 'learning_rate': 7.91027695298606e-05, 'epoch': 2.23}\n",
- "{'loss': 0.3866, 'grad_norm': 4.870989799499512, 'learning_rate': 7.86788218175523e-05, 'epoch': 2.25}\n",
- "{'loss': 0.383, 'grad_norm': 2.3800389766693115, 'learning_rate': 7.8251778870645e-05, 'epoch': 2.27}\n",
- "{'loss': 0.3855, 'grad_norm': 3.800349473953247, 'learning_rate': 7.782168677883206e-05, 'epoch': 2.28}\n",
- "{'loss': 0.4051, 'grad_norm': 2.723214864730835, 'learning_rate': 7.738859196089358e-05, 'epoch': 2.3}\n",
- "{'loss': 0.4282, 'grad_norm': 3.5306265354156494, 'learning_rate': 7.695254115968648e-05, 'epoch': 2.32}\n",
- "{'loss': 0.4128, 'grad_norm': 2.6264665126800537, 'learning_rate': 7.651358143709972e-05, 'epoch': 2.34}\n",
- "{'loss': 0.4174, 'grad_norm': 3.427201747894287, 'learning_rate': 7.60717601689749e-05, 'epoch': 2.36}\n",
- "{'loss': 0.3553, 'grad_norm': 3.8674330711364746, 'learning_rate': 7.562712503999327e-05, 'epoch': 2.37}\n",
- "{'loss': 0.4509, 'grad_norm': 3.253030776977539, 'learning_rate': 7.517972403852905e-05, 'epoch': 2.39}\n",
- "{'loss': 0.3599, 'grad_norm': 3.4824795722961426, 'learning_rate': 7.472960545147038e-05, 'epoch': 2.41}\n",
- "{'loss': 0.3248, 'grad_norm': 4.311473369598389, 'learning_rate': 7.427681785900761e-05, 'epoch': 2.43}\n",
- "{'loss': 0.3835, 'grad_norm': 3.2026665210723877, 'learning_rate': 7.382141012939034e-05, 'epoch': 2.45}\n",
- "{'loss': 0.3631, 'grad_norm': 3.4886059761047363, 'learning_rate': 7.33634314136531e-05, 'epoch': 2.46}\n",
- "{'loss': 0.4001, 'grad_norm': 2.1931118965148926, 'learning_rate': 7.290293114031061e-05, 'epoch': 2.48}\n",
- "{'loss': 0.4094, 'grad_norm': 3.082930564880371, 'learning_rate': 7.243995901002312e-05, 'epoch': 2.5}\n",
- "{'loss': 0.3916, 'grad_norm': 2.8144562244415283, 'learning_rate': 7.197456499023225e-05, 'epoch': 2.52}\n",
- "{'loss': 0.4212, 'grad_norm': 3.546799898147583, 'learning_rate': 7.150679930976825e-05, 'epoch': 2.53}\n",
- "{'loss': 0.3852, 'grad_norm': 3.623589038848877, 'learning_rate': 7.103671245342887e-05, 'epoch': 2.55}\n",
- "{'loss': 0.3294, 'grad_norm': 3.896050214767456, 'learning_rate': 7.056435515653059e-05, 'epoch': 2.57}\n",
- "{'loss': 0.4378, 'grad_norm': 2.8549437522888184, 'learning_rate': 7.008977839943299e-05, 'epoch': 2.59}\n",
- "{'loss': 0.3744, 'grad_norm': 2.963679313659668, 'learning_rate': 6.961303340203653e-05, 'epoch': 2.61}\n",
- "{'loss': 0.4083, 'grad_norm': 3.584379196166992, 'learning_rate': 6.91341716182545e-05, 'epoch': 2.62}\n",
- "{'loss': 0.3875, 'grad_norm': 3.231067180633545, 'learning_rate': 6.86532447304597e-05, 'epoch': 2.64}\n",
- "{'loss': 0.3555, 'grad_norm': 3.2355687618255615, 'learning_rate': 6.817030464390656e-05, 'epoch': 2.66}\n",
- "{'loss': 0.3962, 'grad_norm': 4.36820125579834, 'learning_rate': 6.768540348112907e-05, 'epoch': 2.68}\n",
- "{'loss': 0.3224, 'grad_norm': 2.6882545948028564, 'learning_rate': 6.719859357631535e-05, 'epoch': 2.7}\n",
- "{'loss': 0.3478, 'grad_norm': 3.5584182739257812, 'learning_rate': 6.670992746965938e-05, 'epoch': 2.71}\n",
- "{'loss': 0.4298, 'grad_norm': 4.19834041595459, 'learning_rate': 6.621945790169036e-05, 'epoch': 2.73}\n",
- "{'loss': 0.4304, 'grad_norm': 4.770883083343506, 'learning_rate': 6.572723780758069e-05, 'epoch': 2.75}\n",
- "{'loss': 0.3657, 'grad_norm': 4.010149955749512, 'learning_rate': 6.523332031143272e-05, 'epoch': 2.77}\n",
- "{'loss': 0.3699, 'grad_norm': 3.2105469703674316, 'learning_rate': 6.473775872054521e-05, 'epoch': 2.78}\n",
- "{'loss': 0.3342, 'grad_norm': 3.494490146636963, 'learning_rate': 6.424060651966007e-05, 'epoch': 2.8}\n",
- "{'loss': 0.327, 'grad_norm': 3.291541814804077, 'learning_rate': 6.374191736518974e-05, 'epoch': 2.82}\n",
- "{'loss': 0.3928, 'grad_norm': 3.125520706176758, 'learning_rate': 6.324174507942637e-05, 'epoch': 2.84}\n",
- "{'loss': 0.3776, 'grad_norm': 4.660810470581055, 'learning_rate': 6.274014364473274e-05, 'epoch': 2.86}\n",
- "{'loss': 0.4623, 'grad_norm': 2.8751118183135986, 'learning_rate': 6.22371671977162e-05, 'epoch': 2.87}\n",
- "{'loss': 0.4122, 'grad_norm': 4.0637078285217285, 'learning_rate': 6.173287002338577e-05, 'epoch': 2.89}\n",
- "{'loss': 0.4056, 'grad_norm': 3.7399301528930664, 'learning_rate': 6.122730654929334e-05, 'epoch': 2.91}\n",
- "{'loss': 0.3351, 'grad_norm': 4.581759452819824, 'learning_rate': 6.072053133965938e-05, 'epoch': 2.93}\n",
- "{'loss': 0.3849, 'grad_norm': 3.381431818008423, 'learning_rate': 6.021259908948402e-05, 'epoch': 2.95}\n",
- "{'loss': 0.3947, 'grad_norm': 4.740965366363525, 'learning_rate': 5.970356461864391e-05, 'epoch': 2.96}\n",
- "{'loss': 0.3945, 'grad_norm': 5.124401569366455, 'learning_rate': 5.919348286597569e-05, 'epoch': 2.98}\n",
- "{'loss': 0.4098, 'grad_norm': 3.3869075775146484, 'learning_rate': 5.868240888334653e-05, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโ | 1680/3360 [3:13:07<3:03:37, 6.56s/it][INFO|trainer.py:3788] 2024-07-04 18:39:30,098 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 18:39:30,098 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 18:39:30,098 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 4%|โโ | 2/46 [00:00<00:03, 13.92it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:05, 7.78it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:06, 6.54it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:01<00:06, 5.99it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:01<00:07, 4.94it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:01<00:10, 3.63it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:03<00:26, 1.36it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:04<00:26, 1.34it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:04<00:20, 1.62it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:04<00:16, 2.03it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:05<00:13, 2.46it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:05<00:10, 2.89it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:05<00:09, 3.32it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:05<00:07, 3.74it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:05<00:06, 4.13it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:06<00:06, 4.19it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:06<00:06, 4.31it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:06<00:05, 4.72it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:06<00:04, 5.04it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:06<00:04, 5.33it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:06<00:04, 5.49it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:07<00:03, 5.80it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:07<00:03, 5.90it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:07<00:03, 5.85it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:07<00:03, 5.65it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:07<00:03, 5.47it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:08<00:03, 4.67it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:09<00:09, 1.53it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:10<00:07, 1.81it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:10<00:06, 2.16it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:10<00:04, 2.48it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:10<00:04, 2.72it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:11<00:03, 2.82it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:12<00:06, 1.41it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:13<00:05, 1.49it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:13<00:03, 1.92it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:13<00:02, 2.46it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:13<00:01, 3.12it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:13<00:01, 3.85it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:14<00:00, 4.55it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:14<00:00, 5.25it/s]\u001b[A\n",
- " 98%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 45/46 [00:14<00:00, 5.92it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.002082109451294, 'eval_runtime': 14.5597, 'eval_samples_per_second': 3.159, 'eval_steps_per_second': 3.159, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโ | 1680/3360 [3:13:22<3:03:37, 6.56s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:14<00:00, 6.39it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 18:39:44,661 >> Saving model checkpoint to saves/qwen2-7b/lora/sft/checkpoint-1680\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 18:39:46,491 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 18:39:46,492 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-7B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 18:39:46,762 >> tokenizer config file saved in saves/qwen2-7b/lora/sft/checkpoint-1680/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 18:39:46,762 >> Special tokens file saved in saves/qwen2-7b/lora/sft/checkpoint-1680/special_tokens_map.json\n",
- "{'loss': 0.1954, 'grad_norm': 2.6332297325134277, 'learning_rate': 5.8170397829712485e-05, 'epoch': 3.02}\n",
- "{'loss': 0.0883, 'grad_norm': 3.9817214012145996, 'learning_rate': 5.765750496516547e-05, 'epoch': 3.03}\n",
- "{'loss': 0.1392, 'grad_norm': 1.9517065286636353, 'learning_rate': 5.714378564496901e-05, 'epoch': 3.05}\n",
- "{'loss': 0.1231, 'grad_norm': 1.8976528644561768, 'learning_rate': 5.6629295313583974e-05, 'epoch': 3.07}\n",
- "{'loss': 0.1299, 'grad_norm': 1.4403581619262695, 'learning_rate': 5.611408949868457e-05, 'epoch': 3.09}\n",
- "{'loss': 0.1068, 'grad_norm': 3.757260322570801, 'learning_rate': 5.559822380516539e-05, 'epoch': 3.11}\n",
- "{'loss': 0.0975, 'grad_norm': 1.9946837425231934, 'learning_rate': 5.5081753909140096e-05, 'epoch': 3.12}\n",
- "{'loss': 0.1205, 'grad_norm': 3.075326442718506, 'learning_rate': 5.456473555193242e-05, 'epoch': 3.14}\n",
- "{'loss': 0.1226, 'grad_norm': 2.4876396656036377, 'learning_rate': 5.404722453406017e-05, 'epoch': 3.16}\n",
- "{'loss': 0.1144, 'grad_norm': 2.430744171142578, 'learning_rate': 5.3529276709212816e-05, 'epoch': 3.18}\n",
- "{'loss': 0.1399, 'grad_norm': 3.6195318698883057, 'learning_rate': 5.30109479782233e-05, 'epoch': 3.2}\n",
- "{'loss': 0.1156, 'grad_norm': 3.914135217666626, 'learning_rate': 5.249229428303486e-05, 'epoch': 3.21}\n",
- "{'loss': 0.1372, 'grad_norm': 1.994607925415039, 'learning_rate': 5.197337160066331e-05, 'epoch': 3.23}\n",
- "{'loss': 0.1138, 'grad_norm': 1.6210600137710571, 'learning_rate': 5.145423593715557e-05, 'epoch': 3.25}\n",
- "{'loss': 0.14, 'grad_norm': 2.50508713722229, 'learning_rate': 5.0934943321545115e-05, 'epoch': 3.27}\n",
- "{'loss': 0.1152, 'grad_norm': 4.362739562988281, 'learning_rate': 5.041554979980486e-05, 'epoch': 3.28}\n",
- "{'loss': 0.1549, 'grad_norm': 3.601013422012329, 'learning_rate': 4.9896111428798254e-05, 'epoch': 3.3}\n",
- "{'loss': 0.1429, 'grad_norm': 2.076098680496216, 'learning_rate': 4.9376684270229254e-05, 'epoch': 3.32}\n",
- "{'loss': 0.1353, 'grad_norm': 1.633200764656067, 'learning_rate': 4.8857324384591653e-05, 'epoch': 3.34}\n",
- "{'loss': 0.1284, 'grad_norm': 4.053235054016113, 'learning_rate': 4.8338087825118675e-05, 'epoch': 3.36}\n",
- "{'loss': 0.1526, 'grad_norm': 2.4892356395721436, 'learning_rate': 4.781903063173321e-05, 'epoch': 3.37}\n",
- "{'loss': 0.1042, 'grad_norm': 1.8938469886779785, 'learning_rate': 4.730020882499964e-05, 'epoch': 3.39}\n",
- "{'loss': 0.1569, 'grad_norm': 1.758270502090454, 'learning_rate': 4.678167840007767e-05, 'epoch': 3.41}\n",
- "{'loss': 0.117, 'grad_norm': 1.9446786642074585, 'learning_rate': 4.626349532067879e-05, 'epoch': 3.43}\n",
- "{'loss': 0.1603, 'grad_norm': 2.5028741359710693, 'learning_rate': 4.574571551302647e-05, 'epoch': 3.44}\n",
- "{'loss': 0.1528, 'grad_norm': 3.524077892303467, 'learning_rate': 4.522839485981994e-05, 'epoch': 3.46}\n",
- "{'loss': 0.1366, 'grad_norm': 2.425860643386841, 'learning_rate': 4.471158919420312e-05, 'epoch': 3.48}\n",
- "{'loss': 0.1231, 'grad_norm': 2.6059088706970215, 'learning_rate': 4.4195354293738484e-05, 'epoch': 3.5}\n",
- "{'loss': 0.1479, 'grad_norm': 3.934004783630371, 'learning_rate': 4.367974587438733e-05, 'epoch': 3.52}\n",
- "{'loss': 0.1466, 'grad_norm': 2.3225414752960205, 'learning_rate': 4.316481958449634e-05, 'epoch': 3.53}\n",
- "{'loss': 0.1161, 'grad_norm': 3.3421878814697266, 'learning_rate': 4.2650630998791615e-05, 'epoch': 3.55}\n",
- "{'loss': 0.1312, 'grad_norm': 2.411162853240967, 'learning_rate': 4.213723561238074e-05, 'epoch': 3.57}\n",
- "{'loss': 0.1144, 'grad_norm': 2.74504017829895, 'learning_rate': 4.162468883476319e-05, 'epoch': 3.59}\n",
- "{'loss': 0.1303, 'grad_norm': 3.3871073722839355, 'learning_rate': 4.111304598385018e-05, 'epoch': 3.61}\n",
- "{'loss': 0.1272, 'grad_norm': 2.4120686054229736, 'learning_rate': 4.060236227999441e-05, 'epoch': 3.62}\n",
- "{'loss': 0.1127, 'grad_norm': 2.2959489822387695, 'learning_rate': 4.0092692840030134e-05, 'epoch': 3.64}\n",
- "{'loss': 0.131, 'grad_norm': 2.5716683864593506, 'learning_rate': 3.9584092671324606e-05, 'epoch': 3.66}\n",
- "{'loss': 0.1512, 'grad_norm': 3.035562753677368, 'learning_rate': 3.907661666584131e-05, 'epoch': 3.68}\n",
- "{'loss': 0.1253, 'grad_norm': 2.897613048553467, 'learning_rate': 3.857031959421553e-05, 'epoch': 3.69}\n",
- "{'loss': 0.1084, 'grad_norm': 2.2627975940704346, 'learning_rate': 3.806525609984312e-05, 'epoch': 3.71}\n",
- "{'loss': 0.105, 'grad_norm': 2.2742927074432373, 'learning_rate': 3.7561480692983006e-05, 'epoch': 3.73}\n",
- "{'loss': 0.1489, 'grad_norm': 1.9651683568954468, 'learning_rate': 3.705904774487396e-05, 'epoch': 3.75}\n",
- "{'loss': 0.1448, 'grad_norm': 4.107623100280762, 'learning_rate': 3.655801148186655e-05, 'epoch': 3.77}\n",
- "{'loss': 0.0998, 'grad_norm': 2.270852565765381, 'learning_rate': 3.6058425979570485e-05, 'epoch': 3.78}\n",
- "{'loss': 0.1176, 'grad_norm': 3.770810842514038, 'learning_rate': 3.556034515701852e-05, 'epoch': 3.8}\n",
- "{'loss': 0.1175, 'grad_norm': 4.139482498168945, 'learning_rate': 3.506382277084696e-05, 'epoch': 3.82}\n",
- "{'loss': 0.152, 'grad_norm': 2.7534141540527344, 'learning_rate': 3.4568912409493945e-05, 'epoch': 3.84}\n",
- "{'loss': 0.0974, 'grad_norm': 2.224083423614502, 'learning_rate': 3.4075667487415785e-05, 'epoch': 3.86}\n",
- "{'loss': 0.1133, 'grad_norm': 1.7634135484695435, 'learning_rate': 3.358414123932195e-05, 'epoch': 3.87}\n",
- "{'loss': 0.1311, 'grad_norm': 2.7758963108062744, 'learning_rate': 3.3094386714429724e-05, 'epoch': 3.89}\n",
- "{'loss': 0.1341, 'grad_norm': 2.842358350753784, 'learning_rate': 3.2606456770738636e-05, 'epoch': 3.91}\n",
- "{'loss': 0.0884, 'grad_norm': 1.71796452999115, 'learning_rate': 3.212040406932569e-05, 'epoch': 3.93}\n",
- "{'loss': 0.0956, 'grad_norm': 2.689420461654663, 'learning_rate': 3.163628106866172e-05, 'epoch': 3.94}\n",
- "{'loss': 0.1731, 'grad_norm': 2.630415439605713, 'learning_rate': 3.115414001894974e-05, 'epoch': 3.96}\n",
- "{'loss': 0.1458, 'grad_norm': 2.928737163543701, 'learning_rate': 3.067403295648566e-05, 'epoch': 3.98}\n",
- "{'loss': 0.1278, 'grad_norm': 2.467090129852295, 'learning_rate': 3.019601169804216e-05, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [4:14:45<2:03:53, 6.64s/it][INFO|trainer.py:3788] 2024-07-04 19:41:08,043 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 19:41:08,044 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 19:41:08,044 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 4%|โโ | 2/46 [00:00<00:03, 13.68it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:04, 8.54it/s]\u001b[A\n",
- " 11%|โโโโโ | 5/46 [00:00<00:05, 8.15it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:05, 7.67it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:00<00:05, 7.17it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:01<00:05, 6.86it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:01<00:05, 6.53it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:01<00:06, 5.50it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:01<00:07, 4.52it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:03<00:18, 1.87it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:04<00:22, 1.44it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:04<00:18, 1.78it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:04<00:13, 2.22it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:04<00:11, 2.72it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:04<00:09, 3.13it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:05<00:07, 3.65it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:05<00:06, 4.08it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:05<00:05, 4.42it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:05<00:05, 4.48it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:05<00:05, 4.72it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:06<00:04, 4.77it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:06<00:05, 4.21it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:06<00:05, 4.13it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:06<00:05, 3.63it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:07<00:06, 3.16it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:09<00:13, 1.34it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:09<00:11, 1.43it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:09<00:08, 1.84it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:10<00:06, 2.28it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:10<00:05, 2.71it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:10<00:04, 3.14it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:10<00:03, 3.74it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:10<00:02, 4.33it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:10<00:02, 4.70it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:11<00:01, 5.04it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:11<00:01, 5.26it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:11<00:01, 5.47it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโ๏ฟฝ๏ฟฝ๏ฟฝโโโโโโโโโโโ | 40/46 [00:11<00:01, 5.30it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:11<00:00, 5.17it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:12<00:00, 5.07it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:12<00:00, 5.08it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:12<00:00, 4.41it/s]\u001b[A\n",
- " 98%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 45/46 [00:12<00:00, 4.08it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.360382080078125, 'eval_runtime': 13.265, 'eval_samples_per_second': 3.468, 'eval_steps_per_second': 3.468, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [4:14:58<2:03:53, 6.64s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:13<00:00, 4.15it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 19:41:21,314 >> Saving model checkpoint to saves/qwen2-7b/lora/sft/checkpoint-2240\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 19:41:22,728 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 19:41:22,729 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-7B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 19:41:22,849 >> tokenizer config file saved in saves/qwen2-7b/lora/sft/checkpoint-2240/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 19:41:22,850 >> Special tokens file saved in saves/qwen2-7b/lora/sft/checkpoint-2240/special_tokens_map.json\n",
- "{'loss': 0.0535, 'grad_norm': 0.6558727025985718, 'learning_rate': 2.9720127835276256e-05, 'epoch': 4.02}\n",
- "{'loss': 0.0402, 'grad_norm': 2.1889960765838623, 'learning_rate': 2.9246432729161055e-05, 'epoch': 4.03}\n",
- "{'loss': 0.0377, 'grad_norm': 0.8100994229316711, 'learning_rate': 2.8774977504442647e-05, 'epoch': 4.05}\n",
- "{'loss': 0.0398, 'grad_norm': 2.8209896087646484, 'learning_rate': 2.8305813044122097e-05, 'epoch': 4.07}\n",
- "{'loss': 0.0308, 'grad_norm': 1.5516138076782227, 'learning_rate': 2.7838989983964065e-05, 'epoch': 4.09}\n",
- "{'loss': 0.0597, 'grad_norm': 4.609562873840332, 'learning_rate': 2.737455870703155e-05, 'epoch': 4.11}\n",
- "{'loss': 0.0231, 'grad_norm': 2.7549400329589844, 'learning_rate': 2.6912569338248315e-05, 'epoch': 4.12}\n",
- "{'loss': 0.0448, 'grad_norm': 5.040008068084717, 'learning_rate': 2.645307173898901e-05, 'epoch': 4.14}\n",
- "{'loss': 0.0253, 'grad_norm': 1.6336179971694946, 'learning_rate': 2.5996115501697694e-05, 'epoch': 4.16}\n",
- "{'loss': 0.0289, 'grad_norm': 0.8074469566345215, 'learning_rate': 2.5541749944535554e-05, 'epoch': 4.18}\n",
- "{'loss': 0.0338, 'grad_norm': 1.710808277130127, 'learning_rate': 2.5090024106057962e-05, 'epoch': 4.19}\n",
- "{'loss': 0.0379, 'grad_norm': 2.1768016815185547, 'learning_rate': 2.464098673992205e-05, 'epoch': 4.21}\n",
- "{'loss': 0.0682, 'grad_norm': 3.6282131671905518, 'learning_rate': 2.4194686309624663e-05, 'epoch': 4.23}\n",
- "{'loss': 0.0515, 'grad_norm': 1.100537896156311, 'learning_rate': 2.3751170983272e-05, 'epoch': 4.25}\n",
- "{'loss': 0.0354, 'grad_norm': 0.6081830859184265, 'learning_rate': 2.3310488628380757e-05, 'epoch': 4.27}\n",
- "{'loss': 0.0334, 'grad_norm': 1.5605361461639404, 'learning_rate': 2.2872686806712035e-05, 'epoch': 4.28}\n",
- "{'loss': 0.0492, 'grad_norm': 2.5406620502471924, 'learning_rate': 2.243781276913811e-05, 'epoch': 4.3}\n",
- "{'loss': 0.0279, 'grad_norm': 2.160897970199585, 'learning_rate': 2.200591345054267e-05, 'epoch': 4.32}\n",
- "{'loss': 0.0342, 'grad_norm': 2.3391342163085938, 'learning_rate': 2.157703546475539e-05, 'epoch': 4.34}\n",
- "{'loss': 0.0332, 'grad_norm': 1.3248311281204224, 'learning_rate': 2.115122509952085e-05, 'epoch': 4.36}\n",
- "{'loss': 0.0334, 'grad_norm': 2.741152763366699, 'learning_rate': 2.0728528311502976e-05, 'epoch': 4.37}\n",
- "{'loss': 0.0542, 'grad_norm': 2.237809419631958, 'learning_rate': 2.0308990721324927e-05, 'epoch': 4.39}\n",
- "{'loss': 0.0344, 'grad_norm': 3.8997409343719482, 'learning_rate': 1.989265760864542e-05, 'epoch': 4.41}\n",
- "{'loss': 0.0439, 'grad_norm': 0.6022194623947144, 'learning_rate': 1.947957390727185e-05, 'epoch': 4.43}\n",
- "{'loss': 0.0346, 'grad_norm': 1.2296243906021118, 'learning_rate': 1.906978420031059e-05, 'epoch': 4.44}\n",
- "{'loss': 0.0209, 'grad_norm': 0.28131213784217834, 'learning_rate': 1.8663332715355396e-05, 'epoch': 4.46}\n",
- "{'loss': 0.0271, 'grad_norm': 2.75640606880188, 'learning_rate': 1.8260263319713844e-05, 'epoch': 4.48}\n",
- "{'loss': 0.0408, 'grad_norm': 3.289303779602051, 'learning_rate': 1.7860619515673033e-05, 'epoch': 4.5}\n",
- "{'loss': 0.0344, 'grad_norm': 1.2157098054885864, 'learning_rate': 1.746444443580433e-05, 'epoch': 4.52}\n",
- "{'loss': 0.0272, 'grad_norm': 1.5058122873306274, 'learning_rate': 1.7071780838308288e-05, 'epoch': 4.53}\n",
- "{'loss': 0.0283, 'grad_norm': 1.8522496223449707, 'learning_rate': 1.6682671102399805e-05, 'epoch': 4.55}\n",
- "{'loss': 0.027, 'grad_norm': 2.126176595687866, 'learning_rate': 1.629715722373423e-05, 'epoch': 4.57}\n",
- "{'loss': 0.0434, 'grad_norm': 2.065514326095581, 'learning_rate': 1.5915280809874932e-05, 'epoch': 4.59}\n",
- "{'loss': 0.0427, 'grad_norm': 2.2047812938690186, 'learning_rate': 1.553708307580265e-05, 'epoch': 4.61}\n",
- "{'loss': 0.0266, 'grad_norm': 2.1723501682281494, 'learning_rate': 1.5162604839467265e-05, 'epoch': 4.62}\n",
- "{'loss': 0.0201, 'grad_norm': 1.7166253328323364, 'learning_rate': 1.4791886517382413e-05, 'epoch': 4.64}\n",
- "{'loss': 0.0306, 'grad_norm': 0.5556966066360474, 'learning_rate': 1.4424968120263504e-05, 'epoch': 4.66}\n",
- "{'loss': 0.0249, 'grad_norm': 1.101198434829712, 'learning_rate': 1.4061889248709343e-05, 'epoch': 4.68}\n",
- "{'loss': 0.0324, 'grad_norm': 0.6396570801734924, 'learning_rate': 1.370268908892825e-05, 'epoch': 4.69}\n",
- "{'loss': 0.0303, 'grad_norm': 2.5093636512756348, 'learning_rate': 1.3347406408508695e-05, 'epoch': 4.71}\n",
- "{'loss': 0.0522, 'grad_norm': 1.5739742517471313, 'learning_rate': 1.2996079552235263e-05, 'epoch': 4.73}\n",
- "{'loss': 0.0293, 'grad_norm': 0.9539183974266052, 'learning_rate': 1.264874643795021e-05, 'epoch': 4.75}\n",
- "{'loss': 0.0289, 'grad_norm': 0.5063753724098206, 'learning_rate': 1.230544455246101e-05, 'epoch': 4.77}\n",
- "{'loss': 0.0457, 'grad_norm': 1.6972631216049194, 'learning_rate': 1.1966210947494583e-05, 'epoch': 4.78}\n",
- "{'loss': 0.0228, 'grad_norm': 0.8949175477027893, 'learning_rate': 1.1631082235698316e-05, 'epoch': 4.8}\n",
- "{'loss': 0.0345, 'grad_norm': 1.8337916135787964, 'learning_rate': 1.130009458668863e-05, 'epoch': 4.82}\n",
- "{'loss': 0.0221, 'grad_norm': 2.356985569000244, 'learning_rate': 1.097328372314721e-05, 'epoch': 4.84}\n",
- "{'loss': 0.0328, 'grad_norm': 2.9775609970092773, 'learning_rate': 1.0650684916965559e-05, 'epoch': 4.85}\n",
- "{'loss': 0.0298, 'grad_norm': 2.2749829292297363, 'learning_rate': 1.0332332985438248e-05, 'epoch': 4.87}\n",
- "{'loss': 0.0411, 'grad_norm': 1.9781012535095215, 'learning_rate': 1.0018262287505086e-05, 'epoch': 4.89}\n",
- "{'loss': 0.0461, 'grad_norm': 1.8106870651245117, 'learning_rate': 9.708506720042932e-06, 'epoch': 4.91}\n",
- "{'loss': 0.0354, 'grad_norm': 1.3991378545761108, 'learning_rate': 9.403099714207175e-06, 'epoch': 4.93}\n",
- "{'loss': 0.0269, 'grad_norm': 0.6455625891685486, 'learning_rate': 9.102074231823727e-06, 'epoch': 4.94}\n",
- "{'loss': 0.0339, 'grad_norm': 1.2710880041122437, 'learning_rate': 8.805462761831418e-06, 'epoch': 4.96}\n",
- "{'loss': 0.0334, 'grad_norm': 1.1816545724868774, 'learning_rate': 8.513297316775625e-06, 'epoch': 4.98}\n",
- "{'loss': 0.0301, 'grad_norm': 1.668415904045105, 'learning_rate': 8.225609429353187e-06, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [5:16:56<1:03:45, 6.83s/it][INFO|trainer.py:3788] 2024-07-04 20:43:18,672 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 20:43:18,672 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 20:43:18,673 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 4%|โโ | 2/46 [00:00<00:03, 13.51it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:04, 9.12it/s]\u001b[A\n",
- " 11%|โโโโโ | 5/46 [00:00<00:04, 8.61it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:04, 8.44it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:00<00:04, 8.01it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:04, 7.64it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:01<00:05, 7.15it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:01<00:05, 6.01it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:01<00:07, 4.84it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:01<00:07, 4.34it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:02<00:07, 4.27it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:02<00:07, 4.39it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:02<00:07, 4.38it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:02<00:06, 4.41it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:03<00:06, 4.23it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:03<00:06, 4.24it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:03<00:06, 4.49it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:03<00:05, 4.73it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:03<00:05, 4.44it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:04<00:09, 2.42it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:05<00:14, 1.60it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:06<00:10, 2.01it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:06<00:08, 2.56it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:06<00:06, 3.08it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:06<00:05, 3.68it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:06<00:04, 4.17it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:06<00:03, 4.56it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:07<00:03, 4.55it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:07<00:03, 4.33it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโ๏ฟฝ๏ฟฝ๏ฟฝโโโ | 32/46 [00:07<00:03, 3.76it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:08<00:04, 2.88it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:08<00:04, 2.61it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:09<00:05, 2.11it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:09<00:04, 2.31it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:10<00:03, 2.61it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:10<00:02, 2.80it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:10<00:02, 2.93it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:10<00:01, 3.12it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:11<00:01, 3.19it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:11<00:01, 3.09it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:11<00:00, 3.34it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:11<00:00, 3.84it/s]\u001b[A\n",
- " 98%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 45/46 [00:12<00:00, 4.50it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.715369939804077, 'eval_runtime': 12.5246, 'eval_samples_per_second': 3.673, 'eval_steps_per_second': 3.673, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [5:17:08<1:03:45, 6.83s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:12<00:00, 5.17it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 20:43:31,199 >> Saving model checkpoint to saves/qwen2-7b/lora/sft/checkpoint-2800\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 20:43:32,430 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 20:43:32,431 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-7B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 20:43:32,551 >> tokenizer config file saved in saves/qwen2-7b/lora/sft/checkpoint-2800/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 20:43:32,551 >> Special tokens file saved in saves/qwen2-7b/lora/sft/checkpoint-2800/special_tokens_map.json\n",
- "{'loss': 0.0078, 'grad_norm': 1.6351463794708252, 'learning_rate': 7.942430149009161e-06, 'epoch': 5.02}\n",
- "{'loss': 0.0062, 'grad_norm': 0.11965573579072952, 'learning_rate': 7.663790038585793e-06, 'epoch': 5.03}\n",
- "{'loss': 0.0105, 'grad_norm': 0.05803072825074196, 'learning_rate': 7.389719171023857e-06, 'epoch': 5.05}\n",
- "{'loss': 0.0058, 'grad_norm': 0.143271803855896, 'learning_rate': 7.1202471261170245e-06, 'epoch': 5.07}\n",
- "{'loss': 0.0035, 'grad_norm': 0.17391343414783478, 'learning_rate': 6.855402987319348e-06, 'epoch': 5.09}\n",
- "{'loss': 0.0072, 'grad_norm': 0.19679808616638184, 'learning_rate': 6.595215338606397e-06, 'epoch': 5.1}\n",
- "{'loss': 0.0123, 'grad_norm': 0.09687481820583344, 'learning_rate': 6.339712261390213e-06, 'epoch': 5.12}\n",
- "{'loss': 0.0057, 'grad_norm': 0.069660983979702, 'learning_rate': 6.088921331488568e-06, 'epoch': 5.14}\n",
- "{'loss': 0.0072, 'grad_norm': 1.3626017570495605, 'learning_rate': 5.8428696161488215e-06, 'epoch': 5.16}\n",
- "{'loss': 0.0047, 'grad_norm': 2.0419363975524902, 'learning_rate': 5.601583671126531e-06, 'epoch': 5.18}\n",
- "{'loss': 0.0097, 'grad_norm': 0.2337513566017151, 'learning_rate': 5.365089537819434e-06, 'epoch': 5.19}\n",
- "{'loss': 0.0042, 'grad_norm': 0.05815720558166504, 'learning_rate': 5.133412740456806e-06, 'epoch': 5.21}\n",
- "{'loss': 0.008, 'grad_norm': 1.3515617847442627, 'learning_rate': 4.906578283344759e-06, 'epoch': 5.23}\n",
- "{'loss': 0.0087, 'grad_norm': 0.37659117579460144, 'learning_rate': 4.684610648167503e-06, 'epoch': 5.25}\n",
- "{'loss': 0.0031, 'grad_norm': 0.33385252952575684, 'learning_rate': 4.467533791345191e-06, 'epoch': 5.27}\n",
- "{'loss': 0.0067, 'grad_norm': 0.15747712552547455, 'learning_rate': 4.255371141448272e-06, 'epoch': 5.28}\n",
- "{'loss': 0.007, 'grad_norm': 1.2530337572097778, 'learning_rate': 4.048145596668967e-06, 'epoch': 5.3}\n",
- "{'loss': 0.0136, 'grad_norm': 2.182263135910034, 'learning_rate': 3.84587952234991e-06, 'epoch': 5.32}\n",
- "{'loss': 0.0035, 'grad_norm': 1.1545133590698242, 'learning_rate': 3.6485947485702832e-06, 'epoch': 5.34}\n",
- "{'loss': 0.0061, 'grad_norm': 0.33282843232154846, 'learning_rate': 3.4563125677897932e-06, 'epoch': 5.35}\n",
- "{'loss': 0.004, 'grad_norm': 0.2662621736526489, 'learning_rate': 3.269053732550581e-06, 'epoch': 5.37}\n",
- "{'loss': 0.0071, 'grad_norm': 1.1687767505645752, 'learning_rate': 3.086838453237506e-06, 'epoch': 5.39}\n",
- "{'loss': 0.0082, 'grad_norm': 0.12040398269891739, 'learning_rate': 2.9096863958968268e-06, 'epoch': 5.41}\n",
- "{'loss': 0.0042, 'grad_norm': 0.22544123232364655, 'learning_rate': 2.737616680113758e-06, 'epoch': 5.43}\n",
- "{'loss': 0.0056, 'grad_norm': 0.3548804521560669, 'learning_rate': 2.570647876948895e-06, 'epoch': 5.44}\n",
- "{'loss': 0.0133, 'grad_norm': 0.7295147180557251, 'learning_rate': 2.408798006933882e-06, 'epoch': 5.46}\n",
- "{'loss': 0.0125, 'grad_norm': 0.05939454585313797, 'learning_rate': 2.252084538126542e-06, 'epoch': 5.48}\n",
- "{'loss': 0.0064, 'grad_norm': 0.5182624459266663, 'learning_rate': 2.100524384225555e-06, 'epoch': 5.5}\n",
- "{'loss': 0.0043, 'grad_norm': 0.13460208475589752, 'learning_rate': 1.9541339027450256e-06, 'epoch': 5.52}\n",
- "{'loss': 0.0066, 'grad_norm': 0.8837604522705078, 'learning_rate': 1.8129288932490274e-06, 'epoch': 5.53}\n",
- "{'loss': 0.0092, 'grad_norm': 0.332492858171463, 'learning_rate': 1.6769245956464396e-06, 'epoch': 5.55}\n",
- "{'loss': 0.0048, 'grad_norm': 0.2933903634548187, 'learning_rate': 1.5461356885461075e-06, 'epoch': 5.57}\n",
- "{'loss': 0.0054, 'grad_norm': 0.371267706155777, 'learning_rate': 1.4205762876726092e-06, 'epoch': 5.59}\n",
- "{'loss': 0.0083, 'grad_norm': 0.14521144330501556, 'learning_rate': 1.3002599443428243e-06, 'epoch': 5.6}\n",
- "{'loss': 0.0073, 'grad_norm': 1.345499038696289, 'learning_rate': 1.1851996440033319e-06, 'epoch': 5.62}\n",
- "{'loss': 0.0064, 'grad_norm': 0.025303443893790245, 'learning_rate': 1.0754078048289374e-06, 'epoch': 5.64}\n",
- "{'loss': 0.0049, 'grad_norm': 1.9373172521591187, 'learning_rate': 9.708962763824048e-07, 'epoch': 5.66}\n",
- "{'loss': 0.0063, 'grad_norm': 0.6459546685218811, 'learning_rate': 8.716763383355864e-07, 'epoch': 5.68}\n",
- "{'loss': 0.005, 'grad_norm': 1.4349000453948975, 'learning_rate': 7.777586992519959e-07, 'epoch': 5.69}\n",
- "{'loss': 0.0103, 'grad_norm': 0.5553787350654602, 'learning_rate': 6.891534954310885e-07, 'epoch': 5.71}\n",
- "{'loss': 0.0054, 'grad_norm': 0.19051159918308258, 'learning_rate': 6.058702898142643e-07, 'epoch': 5.73}\n",
- "{'loss': 0.0059, 'grad_norm': 0.36273324489593506, 'learning_rate': 5.279180709527765e-07, 'epoch': 5.75}\n",
- "{'loss': 0.0084, 'grad_norm': 0.4064849019050598, 'learning_rate': 4.553052520375911e-07, 'epoch': 5.77}\n",
- "{'loss': 0.0033, 'grad_norm': 0.2132396250963211, 'learning_rate': 3.8803966999139684e-07, 'epoch': 5.78}\n",
- "{'loss': 0.0176, 'grad_norm': 2.6782572269439697, 'learning_rate': 3.261285846227868e-07, 'epoch': 5.8}\n",
- "{'loss': 0.0064, 'grad_norm': 0.27686187624931335, 'learning_rate': 2.6957867784270787e-07, 'epoch': 5.82}\n",
- "{'loss': 0.0041, 'grad_norm': 0.86066734790802, 'learning_rate': 2.1839605294330933e-07, 'epoch': 5.84}\n",
- "{'loss': 0.0082, 'grad_norm': 0.16934335231781006, 'learning_rate': 1.725862339392259e-07, 'epoch': 5.85}\n",
- "{'loss': 0.0047, 'grad_norm': 0.6522320508956909, 'learning_rate': 1.3215416497138754e-07, 'epoch': 5.87}\n",
- "{'loss': 0.0063, 'grad_norm': 0.5966488718986511, 'learning_rate': 9.710420977340762e-08, 'epoch': 5.89}\n",
- "{'loss': 0.0038, 'grad_norm': 0.1901843547821045, 'learning_rate': 6.744015120061509e-08, 'epoch': 5.91}\n",
- "{'loss': 0.0123, 'grad_norm': 2.4536399841308594, 'learning_rate': 4.316519082179227e-08, 'epoch': 5.93}\n",
- "{'loss': 0.0048, 'grad_norm': 0.5865656733512878, 'learning_rate': 2.4281948573617874e-08, 'epoch': 5.94}\n",
- "{'loss': 0.006, 'grad_norm': 0.9566450715065002, 'learning_rate': 1.0792462477909882e-08, 'epoch': 5.96}\n",
- "{'loss': 0.0043, 'grad_norm': 1.3847167491912842, 'learning_rate': 2.6981884216847884e-09, 'epoch': 5.98}\n",
- "{'loss': 0.0049, 'grad_norm': 1.5407752990722656, 'learning_rate': 0.0, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [6:19:44<00:00, 6.09s/it][INFO|trainer.py:3788] 2024-07-04 21:46:06,786 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 21:46:06,786 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 21:46:06,786 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 4%|โโ | 2/46 [00:00<00:02, 14.89it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:04, 8.77it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:05, 7.73it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:00<00:05, 7.51it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:01<00:05, 7.23it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:01<00:05, 7.01it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:01<00:05, 6.49it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:01<00:06, 5.13it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:02<00:09, 3.59it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:04<00:25, 1.31it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:04<00:21, 1.48it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:04<00:17, 1.80it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:05<00:14, 2.05it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:05<00:12, 2.33it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:05<00:10, 2.67it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:05<00:09, 2.93it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:06<00:08, 3.07it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:06<00:08, 2.99it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:07<00:09, 2.66it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:07<00:09, 2.37it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโ๏ฟฝ๏ฟฝ๏ฟฝโโโโโ | 24/46 [00:08<00:09, 2.37it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:08<00:08, 2.49it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:08<00:07, 2.56it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:09<00:07, 2.63it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:09<00:06, 2.76it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:09<00:05, 2.96it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:09<00:05, 3.06it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:10<00:04, 3.26it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:10<00:04, 3.45it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:10<00:03, 3.75it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:10<00:02, 4.03it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:11<00:02, 4.23it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:11<00:02, 4.50it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:11<00:02, 4.37it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:11<00:01, 4.16it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:12<00:01, 4.24it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:12<00:01, 4.10it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:12<00:01, 3.90it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:12<00:01, 3.65it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:13<00:00, 3.55it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:13<00:00, 3.46it/s]\u001b[A\n",
- " 98%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 45/46 [00:13<00:00, 3.14it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.9878008365631104, 'eval_runtime': 14.6844, 'eval_samples_per_second': 3.133, 'eval_steps_per_second': 3.133, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [6:19:59<00:00, 6.09s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:14<00:00, 2.44it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 21:46:21,487 >> Saving model checkpoint to saves/qwen2-7b/lora/sft/checkpoint-3360\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 21:46:23,425 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 21:46:23,426 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-7B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 21:46:23,565 >> tokenizer config file saved in saves/qwen2-7b/lora/sft/checkpoint-3360/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 21:46:23,565 >> Special tokens file saved in saves/qwen2-7b/lora/sft/checkpoint-3360/special_tokens_map.json\n",
- "[INFO|:482] 2024-07-04 21:46:23,978 >> \n",
- "\n",
- "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
- "\n",
- "\n",
- "{'train_runtime': 22807.0531, 'train_samples_per_second': 1.179, 'train_steps_per_second': 0.147, 'train_loss': 0.5189488330479002, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [6:20:01<00:00, 6.79s/it]\n",
- "[INFO|trainer.py:3478] 2024-07-04 21:46:23,983 >> Saving model checkpoint to saves/qwen2-7b/lora/sft\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 21:46:25,525 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-7b-instruct-bnb-4bit/snapshots/8d8ce83e5c9fc23482eeae78027d1fc87bc2edad/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 21:46:25,525 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-7B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 3584,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 18944,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 28,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 4,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 131072,\n",
- " \"tie_word_embeddings\": false,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 152064\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 21:46:25,650 >> tokenizer config file saved in saves/qwen2-7b/lora/sft/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 21:46:25,650 >> Special tokens file saved in saves/qwen2-7b/lora/sft/special_tokens_map.json\n",
- "***** train metrics *****\n",
- " epoch = 5.9973\n",
- " total_flos = 89914948GF\n",
- " train_loss = 0.5189\n",
- " train_runtime = 6:20:07.05\n",
- " train_samples_per_second = 1.179\n",
- " train_steps_per_second = 0.147\n",
- "Figure saved at: saves/qwen2-7b/lora/sft/training_loss.png\n",
- "Figure saved at: saves/qwen2-7b/lora/sft/training_eval_loss.png\n",
- "[INFO|trainer.py:3788] 2024-07-04 21:46:26,044 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 21:46:26,044 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 21:46:26,045 >> Batch size = 1\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:08<00:00, 5.41it/s]\n",
- "***** eval metrics *****\n",
- " epoch = 5.9973\n",
- " eval_loss = 2.9878\n",
- " eval_runtime = 0:00:08.78\n",
- " eval_samples_per_second = 5.234\n",
- " eval_steps_per_second = 5.234\n",
- "[INFO|modelcard.py:449] 2024-07-04 21:46:34,837 >> Dropping the following result as it does not have all the necessary fields:\n",
- "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: / 0.092 MB of 0.092 MB uploaded\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss โโโโ
โโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime โโโโโ
โโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm โโโโโโโโโโโโ
โ
โโ
โโ
โ
โ
โโโโโ
โโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate โโโ
โโโโโโโโโโโโโโโโโ
โ
โ
โโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss โโโโโโโโ
โ
โ
โ
โ
โ
โ
โโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss 2.9878\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime 8.7891\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second 5.234\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second 5.234\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: total_flos 9.654544053942682e+16\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch 5.99732\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step 3360\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm 1.54078\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate 0.0\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 0.0049\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_loss 0.51895\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_runtime 22807.0531\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_samples_per_second 1.179\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_steps_per_second 0.147\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run \u001b[33mqwen2_7b_lora_sft\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/o710838e\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 6 W&B file(s), 0 media file(s), 1 artifact file(s) and 0 other file(s)\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20240704_152618-o710838e/logs\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The new W&B backend becomes opt-out in version 0.18.0; try it out with `wandb.require(\"core\")`! See https://wandb.me/wandb-core for more information.\n",
- "CPU times: user 23min 50s, sys: 8min 47s, total: 32min 37s\n",
- "Wall time: 6h 56min 32s\n"
- ]
- }
- ],
- "source": [
- "%%time\n",
- "\n",
- "!./scripts/tune-lf.sh config/qwen2_7b_lora_sft_unsloth.yaml"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Current Directory:\n",
- "/home/inflaton/code/projects/courses/llm-finetuning/llama-factory\n",
- "07/04/2024 21:56:42 - WARNING - llamafactory.hparams.parser - We recommend enable `upcast_layernorm` in quantized training.\n",
- "07/04/2024 21:56:42 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:56:42,789 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/vocab.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:56:42,789 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/merges.txt\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:56:42,789 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/tokenizer.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:56:42,789 >> loading file added_tokens.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:56:42,789 >> loading file special_tokens_map.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:56:42,789 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/tokenizer_config.json\n",
- "[WARNING|logging.py:313] 2024-07-04 21:56:42,918 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
- "07/04/2024 21:56:42 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n",
- "07/04/2024 21:56:42 - INFO - llamafactory.data.template - Add <|im_start|> to stop words.\n",
- "07/04/2024 21:56:42 - INFO - llamafactory.data.loader - Loading dataset alpaca_mac.json...\n",
- "Converting format of dataset (num_proc=16): 100%|โ| 4528/4528 [00:00<00:00, 1521\n",
- "Running tokenizer on dataset (num_proc=16): 100%|โ| 4528/4528 [00:01<00:00, 2757\n",
- "input_ids:\n",
- "[151644, 872, 198, 5501, 14683, 279, 2701, 8453, 1467, 1119, 6364, 323, 3410, 1172, 279, 24531, 2213, 11, 4302, 770, 624, 35987, 102895, 99164, 100324, 100717, 100095, 99509, 1773, 151645, 198, 151644, 77091, 198, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "inputs:\n",
- "<|im_start|>user\n",
- "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n",
- "ๅ
จไป็็ไปๆญๆใ<|im_end|>\n",
- "<|im_start|>assistant\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "label_ids:\n",
- "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "labels:\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 21:56:47,196 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-1.5B-Instruct/snapshots/ba1cf1846d7df0a0591d6c00649f57e798519da8/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 21:56:47,197 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "07/04/2024 21:56:47 - INFO - llamafactory.model.model_utils.quantization - Quantizing model to 4 bit with bitsandbytes.\n",
- "๐ฆฅ Unsloth: Will patch your computer to enable 2x faster free finetuning.\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 21:56:48,123 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 21:56:48,123 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"unsloth/qwen2-1.5b-instruct-bnb-4bit\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "==((====))== Unsloth: Fast Qwen2 patching release 2024.6\n",
- " \\\\ /| GPU: NVIDIA GeForce RTX 4080 Laptop GPU. Max memory: 11.994 GB. Platform = Linux.\n",
- "O^O/ \\_/ \\ Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.\n",
- "\\ / Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.\n",
- " \"-____-\" Free Apache license: http://github.com/unslothai/unsloth\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 21:56:49,865 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 21:56:49,865 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"unsloth/qwen2-1.5b-instruct-bnb-4bit\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 21:56:50,495 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 21:56:50,496 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"unsloth/qwen2-1.5b-instruct-bnb-4bit\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|modeling_utils.py:3556] 2024-07-04 21:56:50,707 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/model.safetensors\n",
- "[INFO|modeling_utils.py:1531] 2024-07-04 21:56:56,626 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n",
- "[INFO|configuration_utils.py:1000] 2024-07-04 21:56:56,631 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645\n",
- "}\n",
- "\n",
- "[INFO|modeling_utils.py:4364] 2024-07-04 21:58:31,535 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n",
- "\n",
- "[INFO|modeling_utils.py:4372] 2024-07-04 21:58:31,535 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at unsloth/qwen2-1.5b-instruct-bnb-4bit.\n",
- "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n",
- "[INFO|configuration_utils.py:955] 2024-07-04 21:58:32,073 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/generation_config.json\n",
- "[INFO|configuration_utils.py:1000] 2024-07-04 21:58:32,073 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"do_sample\": true,\n",
- " \"eos_token_id\": [\n",
- " 151645,\n",
- " 151643\n",
- " ],\n",
- " \"pad_token_id\": 151643,\n",
- " \"repetition_penalty\": 1.1,\n",
- " \"temperature\": 0.7,\n",
- " \"top_k\": 20,\n",
- " \"top_p\": 0.8\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:33,489 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/vocab.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:33,489 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/merges.txt\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:33,489 >> loading file added_tokens.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/added_tokens.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:33,490 >> loading file special_tokens_map.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/special_tokens_map.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:33,490 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:33,490 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/tokenizer.json\n",
- "[WARNING|logging.py:313] 2024-07-04 21:58:33,937 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:34,912 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/vocab.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:34,912 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/merges.txt\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:34,912 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/tokenizer.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:34,912 >> loading file added_tokens.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/added_tokens.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:34,912 >> loading file special_tokens_map.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/special_tokens_map.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-04 21:58:34,912 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/tokenizer_config.json\n",
- "[WARNING|logging.py:313] 2024-07-04 21:58:35,100 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
- "07/04/2024 21:58:35 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n",
- "07/04/2024 21:58:35 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n",
- "07/04/2024 21:58:35 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n",
- "07/04/2024 21:58:35 - INFO - llamafactory.model.model_utils.misc - Found linear modules: v_proj,k_proj,down_proj,gate_proj,q_proj,o_proj,up_proj\n",
- "[WARNING|logging.py:328] 2024-07-04 21:58:36,612 >> Unsloth 2024.6 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.\n",
- "07/04/2024 21:58:37 - INFO - llamafactory.model.loader - trainable params: 9,232,384 || all params: 1,786,320,384 || trainable%: 0.5168\n",
- "[INFO|trainer.py:642] 2024-07-04 21:58:37,463 >> Using auto half precision backend\n",
- "07/04/2024 21:58:37 - WARNING - llamafactory.train.callbacks - Previous trainer log in this folder will be deleted.\n",
- "07/04/2024 21:58:37 - INFO - llamafactory.train.trainer_utils - Using LoRA+ optimizer with loraplus lr ratio 16.00.\n",
- "[WARNING|:223] 2024-07-04 21:58:37,613 >> ==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1\n",
- " \\\\ /| Num examples = 4,482 | Num Epochs = 6\n",
- "O^O/ \\_/ \\ Batch size per device = 1 | Gradient Accumulation steps = 8\n",
- "\\ / Total batch size = 8 | Total steps = 3,360\n",
- " \"-____-\" Number of trainable parameters = 9,232,384\n",
- "[INFO|integration_utils.py:750] 2024-07-04 21:58:38,026 >> Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33minflaton-sg\u001b[0m (\u001b[33minflaton-ai\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.17.4\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m/home/inflaton/code/projects/courses/llm-finetuning/llama-factory/wandb/run-20240704_215839-4fbnqsea\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mqwen2_1.5b_lora_sft\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/4fbnqsea\u001b[0m\n",
- "{'loss': 2.2167, 'grad_norm': 1.7105902433395386, 'learning_rate': 2.9761904761904763e-06, 'epoch': 0.02}\n",
- "{'loss': 2.2613, 'grad_norm': 2.196908712387085, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.04}\n",
- "{'loss': 2.0707, 'grad_norm': 0.9740070104598999, 'learning_rate': 8.92857142857143e-06, 'epoch': 0.05}\n",
- "{'loss': 1.9514, 'grad_norm': 1.8389497995376587, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.07}\n",
- "{'loss': 2.1174, 'grad_norm': 1.03471839427948, 'learning_rate': 1.4880952380952381e-05, 'epoch': 0.09}\n",
- "{'loss': 1.8992, 'grad_norm': 1.198785662651062, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.11}\n",
- "{'loss': 2.0404, 'grad_norm': 1.114922285079956, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.12}\n",
- "{'loss': 1.8447, 'grad_norm': 1.1239877939224243, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.14}\n",
- "{'loss': 1.9283, 'grad_norm': 1.5919139385223389, 'learning_rate': 2.6785714285714288e-05, 'epoch': 0.16}\n",
- "{'loss': 1.9026, 'grad_norm': 0.998127818107605, 'learning_rate': 2.9761904761904762e-05, 'epoch': 0.18}\n",
- "{'loss': 1.8932, 'grad_norm': 1.1430412530899048, 'learning_rate': 3.273809523809524e-05, 'epoch': 0.2}\n",
- "{'loss': 1.8906, 'grad_norm': 1.0670546293258667, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.21}\n",
- "{'loss': 1.8343, 'grad_norm': 1.4356828927993774, 'learning_rate': 3.8690476190476195e-05, 'epoch': 0.23}\n",
- "{'loss': 1.8725, 'grad_norm': 1.7043449878692627, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.25}\n",
- "{'loss': 1.7689, 'grad_norm': 1.09099543094635, 'learning_rate': 4.464285714285715e-05, 'epoch': 0.27}\n",
- "{'loss': 1.7491, 'grad_norm': 0.9564052224159241, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.29}\n",
- "{'loss': 1.8979, 'grad_norm': 1.0268529653549194, 'learning_rate': 5.05952380952381e-05, 'epoch': 0.3}\n",
- "{'loss': 1.695, 'grad_norm': 1.2585980892181396, 'learning_rate': 5.3571428571428575e-05, 'epoch': 0.32}\n",
- "{'loss': 1.803, 'grad_norm': 2.146714448928833, 'learning_rate': 5.6547619047619046e-05, 'epoch': 0.34}\n",
- "{'loss': 1.9439, 'grad_norm': 1.163086175918579, 'learning_rate': 5.9523809523809524e-05, 'epoch': 0.36}\n",
- "{'loss': 1.8679, 'grad_norm': 1.2789676189422607, 'learning_rate': 6.25e-05, 'epoch': 0.37}\n",
- "{'loss': 1.7942, 'grad_norm': 1.5350133180618286, 'learning_rate': 6.547619047619048e-05, 'epoch': 0.39}\n",
- "{'loss': 1.7723, 'grad_norm': 1.333762526512146, 'learning_rate': 6.845238095238096e-05, 'epoch': 0.41}\n",
- "{'loss': 1.9781, 'grad_norm': 1.342468500137329, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.43}\n",
- "{'loss': 1.8381, 'grad_norm': 1.785408616065979, 'learning_rate': 7.440476190476191e-05, 'epoch': 0.45}\n",
- "{'loss': 1.77, 'grad_norm': 1.5936214923858643, 'learning_rate': 7.738095238095239e-05, 'epoch': 0.46}\n",
- "{'loss': 1.8368, 'grad_norm': 1.7655868530273438, 'learning_rate': 8.035714285714287e-05, 'epoch': 0.48}\n",
- "{'loss': 1.838, 'grad_norm': 1.5333795547485352, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.5}\n",
- "{'loss': 1.6551, 'grad_norm': 1.4578733444213867, 'learning_rate': 8.630952380952382e-05, 'epoch': 0.52}\n",
- "{'loss': 1.8046, 'grad_norm': 1.649754524230957, 'learning_rate': 8.92857142857143e-05, 'epoch': 0.54}\n",
- "{'loss': 1.8364, 'grad_norm': 1.618801236152649, 'learning_rate': 9.226190476190478e-05, 'epoch': 0.55}\n",
- "{'loss': 1.6749, 'grad_norm': 2.321563243865967, 'learning_rate': 9.523809523809524e-05, 'epoch': 0.57}\n",
- "{'loss': 1.7095, 'grad_norm': 1.7713825702667236, 'learning_rate': 9.821428571428572e-05, 'epoch': 0.59}\n",
- "{'loss': 1.7458, 'grad_norm': 2.338412046432495, 'learning_rate': 9.999956828659095e-05, 'epoch': 0.61}\n",
- "{'loss': 1.7693, 'grad_norm': 2.676462173461914, 'learning_rate': 9.999471159635539e-05, 'epoch': 0.62}\n",
- "{'loss': 1.702, 'grad_norm': 1.777328610420227, 'learning_rate': 9.998445910004082e-05, 'epoch': 0.64}\n",
- "{'loss': 1.8997, 'grad_norm': 2.657947063446045, 'learning_rate': 9.996881190417393e-05, 'epoch': 0.66}\n",
- "{'loss': 1.8264, 'grad_norm': 1.7980377674102783, 'learning_rate': 9.994777169751806e-05, 'epoch': 0.68}\n",
- "{'loss': 1.5464, 'grad_norm': 1.6675528287887573, 'learning_rate': 9.992134075089084e-05, 'epoch': 0.7}\n",
- "{'loss': 1.7621, 'grad_norm': 2.088773727416992, 'learning_rate': 9.988952191691925e-05, 'epoch': 0.71}\n",
- "{'loss': 1.7907, 'grad_norm': 1.8195936679840088, 'learning_rate': 9.985231862973168e-05, 'epoch': 0.73}\n",
- "{'loss': 1.8215, 'grad_norm': 1.8611762523651123, 'learning_rate': 9.980973490458728e-05, 'epoch': 0.75}\n",
- "{'loss': 1.7694, 'grad_norm': 2.018522024154663, 'learning_rate': 9.976177533744261e-05, 'epoch': 0.77}\n",
- "{'loss': 1.741, 'grad_norm': 2.393932342529297, 'learning_rate': 9.97084451044556e-05, 'epoch': 0.79}\n",
- "{'loss': 1.6568, 'grad_norm': 1.8010462522506714, 'learning_rate': 9.964974996142698e-05, 'epoch': 0.8}\n",
- "{'loss': 1.7109, 'grad_norm': 1.6937175989151, 'learning_rate': 9.958569624317893e-05, 'epoch': 0.82}\n",
- "{'loss': 1.7973, 'grad_norm': 2.7904672622680664, 'learning_rate': 9.951629086287151e-05, 'epoch': 0.84}\n",
- "{'loss': 1.7033, 'grad_norm': 1.759727954864502, 'learning_rate': 9.944154131125642e-05, 'epoch': 0.86}\n",
- "{'loss': 1.7797, 'grad_norm': 1.7603638172149658, 'learning_rate': 9.936145565586871e-05, 'epoch': 0.87}\n",
- "{'loss': 1.9387, 'grad_norm': 1.8501616716384888, 'learning_rate': 9.927604254015585e-05, 'epoch': 0.89}\n",
- "{'loss': 1.8734, 'grad_norm': 1.8340226411819458, 'learning_rate': 9.918531118254507e-05, 'epoch': 0.91}\n",
- "{'loss': 1.7725, 'grad_norm': 2.32716703414917, 'learning_rate': 9.90892713754483e-05, 'epoch': 0.93}\n",
- "{'loss': 1.7641, 'grad_norm': 2.673140048980713, 'learning_rate': 9.898793348420536e-05, 'epoch': 0.95}\n",
- "{'loss': 1.7813, 'grad_norm': 1.9481444358825684, 'learning_rate': 9.888130844596524e-05, 'epoch': 0.96}\n",
- "{'loss': 1.7363, 'grad_norm': 2.068895101547241, 'learning_rate': 9.876940776850569e-05, 'epoch': 0.98}\n",
- "{'loss': 1.725, 'grad_norm': 1.8741337060928345, 'learning_rate': 9.865224352899119e-05, 'epoch': 1.0}\n",
- " 17%|โโโโโโโ | 560/3360 [20:10<1:47:08, 2.30s/it][INFO|trainer.py:3788] 2024-07-04 22:18:54,222 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 22:18:54,223 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 22:18:54,223 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:02, 16.12it/s]\u001b[A\n",
- " 11%|โโโโโ | 5/46 [00:00<00:02, 14.12it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:00<00:02, 13.49it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:02, 12.62it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:00<00:03, 10.63it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:01<00:03, 9.63it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:01<00:03, 9.49it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:01<00:03, 9.30it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:01<00:03, 8.91it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:01<00:03, 8.44it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:01<00:03, 8.19it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:02<00:03, 8.03it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:02<00:03, 7.75it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:02<00:03, 7.66it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:02<00:03, 7.52it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:02<00:02, 7.59it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:02<00:02, 8.62it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:02<00:02, 8.84it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:03<00:01, 9.63it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:03<00:01, 10.47it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:03<00:01, 10.89it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:03<00:00, 11.74it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:03<00:00, 12.31it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:03<00:00, 12.75it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:03<00:00, 12.89it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:04<00:00, 13.07it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 1.7500004768371582, 'eval_runtime': 4.4502, 'eval_samples_per_second': 10.337, 'eval_steps_per_second': 10.337, 'epoch': 1.0}\n",
- " 17%|โโโโโโโ | 560/3360 [20:14<1:47:08, 2.30s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:04<00:00, 13.30it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 22:18:58,675 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-560\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 22:18:59,836 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 22:18:59,838 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 22:18:59,966 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-560/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 22:18:59,967 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-560/special_tokens_map.json\n",
- "{'loss': 1.3163, 'grad_norm': 2.1074018478393555, 'learning_rate': 9.852982837266955e-05, 'epoch': 1.02}\n",
- "{'loss': 1.1057, 'grad_norm': 2.284273147583008, 'learning_rate': 9.840217551150706e-05, 'epoch': 1.04}\n",
- "{'loss': 1.3071, 'grad_norm': 1.8365180492401123, 'learning_rate': 9.826929872276255e-05, 'epoch': 1.05}\n",
- "{'loss': 1.2093, 'grad_norm': 2.285874843597412, 'learning_rate': 9.81312123475006e-05, 'epoch': 1.07}\n",
- "{'loss': 1.1653, 'grad_norm': 2.244819402694702, 'learning_rate': 9.798793128904356e-05, 'epoch': 1.09}\n",
- "{'loss': 1.276, 'grad_norm': 2.5032386779785156, 'learning_rate': 9.78394710113631e-05, 'epoch': 1.11}\n",
- "{'loss': 1.2116, 'grad_norm': 2.2835264205932617, 'learning_rate': 9.768584753741134e-05, 'epoch': 1.12}\n",
- "{'loss': 1.1953, 'grad_norm': 3.390573740005493, 'learning_rate': 9.752707744739145e-05, 'epoch': 1.14}\n",
- "{'loss': 1.2537, 'grad_norm': 2.312870502471924, 'learning_rate': 9.736317787696816e-05, 'epoch': 1.16}\n",
- "{'loss': 1.1042, 'grad_norm': 2.252488851547241, 'learning_rate': 9.719416651541839e-05, 'epoch': 1.18}\n",
- "{'loss': 1.1397, 'grad_norm': 1.93602454662323, 'learning_rate': 9.702006160372209e-05, 'epoch': 1.2}\n",
- "{'loss': 1.1822, 'grad_norm': 2.3258895874023438, 'learning_rate': 9.684088193259355e-05, 'epoch': 1.21}\n",
- "{'loss': 1.1777, 'grad_norm': 2.362091064453125, 'learning_rate': 9.665664684045333e-05, 'epoch': 1.23}\n",
- "{'loss': 1.2182, 'grad_norm': 2.36861515045166, 'learning_rate': 9.646737621134112e-05, 'epoch': 1.25}\n",
- "{'loss': 1.181, 'grad_norm': 3.928402900695801, 'learning_rate': 9.627309047276974e-05, 'epoch': 1.27}\n",
- "{'loss': 1.3375, 'grad_norm': 3.1305952072143555, 'learning_rate': 9.607381059352038e-05, 'epoch': 1.29}\n",
- "{'loss': 1.2559, 'grad_norm': 2.16672682762146, 'learning_rate': 9.586955808137958e-05, 'epoch': 1.3}\n",
- "{'loss': 1.26, 'grad_norm': 2.531378984451294, 'learning_rate': 9.566035498081784e-05, 'epoch': 1.32}\n",
- "{'loss': 1.2656, 'grad_norm': 2.2649450302124023, 'learning_rate': 9.544622387061055e-05, 'epoch': 1.34}\n",
- "{'loss': 1.0581, 'grad_norm': 2.7688372135162354, 'learning_rate': 9.522718786140097e-05, 'epoch': 1.36}\n",
- "{'loss': 1.2188, 'grad_norm': 3.3669986724853516, 'learning_rate': 9.500327059320606e-05, 'epoch': 1.37}\n",
- "{'loss': 1.1538, 'grad_norm': 3.8478970527648926, 'learning_rate': 9.477449623286505e-05, 'epoch': 1.39}\n",
- "{'loss': 1.0648, 'grad_norm': 2.5197343826293945, 'learning_rate': 9.454088947143116e-05, 'epoch': 1.41}\n",
- "{'loss': 1.2997, 'grad_norm': 3.149819850921631, 'learning_rate': 9.430247552150673e-05, 'epoch': 1.43}\n",
- "{'loss': 1.1716, 'grad_norm': 2.626891851425171, 'learning_rate': 9.405928011452211e-05, 'epoch': 1.45}\n",
- "{'loss': 1.2123, 'grad_norm': 2.029723644256592, 'learning_rate': 9.381132949795861e-05, 'epoch': 1.46}\n",
- "{'loss': 1.3143, 'grad_norm': 2.6693994998931885, 'learning_rate': 9.35586504325155e-05, 'epoch': 1.48}\n",
- "{'loss': 1.2098, 'grad_norm': 2.4133574962615967, 'learning_rate': 9.330127018922194e-05, 'epoch': 1.5}\n",
- "{'loss': 1.1153, 'grad_norm': 2.2110259532928467, 'learning_rate': 9.303921654649362e-05, 'epoch': 1.52}\n",
- "{'loss': 1.2865, 'grad_norm': 2.425077438354492, 'learning_rate': 9.277251778713474e-05, 'epoch': 1.54}\n",
- "{'loss': 1.2322, 'grad_norm': 2.287026882171631, 'learning_rate': 9.250120269528546e-05, 'epoch': 1.55}\n",
- "{'loss': 1.1479, 'grad_norm': 2.3768105506896973, 'learning_rate': 9.22253005533154e-05, 'epoch': 1.57}\n",
- "{'loss': 1.2783, 'grad_norm': 3.2799324989318848, 'learning_rate': 9.194484113866313e-05, 'epoch': 1.59}\n",
- "{'loss': 1.3401, 'grad_norm': 2.8332979679107666, 'learning_rate': 9.165985472062246e-05, 'epoch': 1.61}\n",
- "{'loss': 1.0948, 'grad_norm': 2.450061321258545, 'learning_rate': 9.137037205707552e-05, 'epoch': 1.62}\n",
- "{'loss': 1.1901, 'grad_norm': 2.617992401123047, 'learning_rate': 9.107642439117321e-05, 'epoch': 1.64}\n",
- "{'loss': 1.2412, 'grad_norm': 2.531679391860962, 'learning_rate': 9.077804344796302e-05, 'epoch': 1.66}\n",
- "{'loss': 1.1875, 'grad_norm': 2.6147513389587402, 'learning_rate': 9.04752614309652e-05, 'epoch': 1.68}\n",
- "{'loss': 1.1308, 'grad_norm': 3.1184866428375244, 'learning_rate': 9.01681110186971e-05, 'epoch': 1.7}\n",
- "{'loss': 1.2466, 'grad_norm': 2.7524633407592773, 'learning_rate': 8.985662536114613e-05, 'epoch': 1.71}\n",
- "{'loss': 1.1582, 'grad_norm': 2.410403251647949, 'learning_rate': 8.954083807619208e-05, 'epoch': 1.73}\n",
- "{'loss': 1.2996, 'grad_norm': 3.132530927658081, 'learning_rate': 8.922078324597879e-05, 'epoch': 1.75}\n",
- "{'loss': 1.3292, 'grad_norm': 3.093569755554199, 'learning_rate': 8.889649541323574e-05, 'epoch': 1.77}\n",
- "{'loss': 1.2493, 'grad_norm': 2.8685665130615234, 'learning_rate': 8.856800957755e-05, 'epoch': 1.78}\n",
- "{'loss': 1.2413, 'grad_norm': 3.4880857467651367, 'learning_rate': 8.823536119158864e-05, 'epoch': 1.8}\n",
- "{'loss': 1.2145, 'grad_norm': 3.321408271789551, 'learning_rate': 8.789858615727265e-05, 'epoch': 1.82}\n",
- "{'loss': 1.1431, 'grad_norm': 2.608922243118286, 'learning_rate': 8.755772082190194e-05, 'epoch': 1.84}\n",
- "{'loss': 1.2395, 'grad_norm': 2.6196181774139404, 'learning_rate': 8.721280197423258e-05, 'epoch': 1.86}\n",
- "{'loss': 1.0924, 'grad_norm': 3.0364978313446045, 'learning_rate': 8.68638668405062e-05, 'epoch': 1.87}\n",
- "{'loss': 1.2218, 'grad_norm': 3.5102291107177734, 'learning_rate': 8.651095308043232e-05, 'epoch': 1.89}\n",
- "{'loss': 1.2639, 'grad_norm': 4.278683662414551, 'learning_rate': 8.61540987831238e-05, 'epoch': 1.91}\n",
- "{'loss': 1.2978, 'grad_norm': 3.729332208633423, 'learning_rate': 8.579334246298593e-05, 'epoch': 1.93}\n",
- "{'loss': 1.1956, 'grad_norm': 3.6756839752197266, 'learning_rate': 8.542872305555978e-05, 'epoch': 1.95}\n",
- "{'loss': 1.1345, 'grad_norm': 2.913640022277832, 'learning_rate': 8.50602799133199e-05, 'epoch': 1.96}\n",
- "{'loss': 1.217, 'grad_norm': 2.75384259223938, 'learning_rate': 8.468805280142709e-05, 'epoch': 1.98}\n",
- "{'loss': 1.2316, 'grad_norm': 3.1801509857177734, 'learning_rate': 8.43120818934367e-05, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโโ | 1120/3360 [41:10<1:24:27, 2.26s/it][INFO|trainer.py:3788] 2024-07-04 22:39:54,830 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 22:39:54,830 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 22:39:54,830 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:02, 19.32it/s]\u001b[A\n",
- " 11%|โโโโโ | 5/46 [00:00<00:02, 14.57it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:00<00:03, 12.84it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:02, 12.95it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:00<00:02, 12.99it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:00<00:02, 13.33it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:01<00:02, 13.58it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:01<00:02, 13.47it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:01<00:01, 13.51it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 13.40it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:01<00:01, 13.16it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:01<00:01, 13.31it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:02<00:01, 13.34it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:02<00:01, 13.10it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:02<00:01, 13.09it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:02<00:01, 12.12it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:02<00:00, 11.79it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:02<00:00, 11.96it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:03<00:00, 11.44it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:03<00:00, 11.61it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:03<00:00, 11.34it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 1.8573294878005981, 'eval_runtime': 3.7539, 'eval_samples_per_second': 12.254, 'eval_steps_per_second': 12.254, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโโ | 1120/3360 [41:14<1:24:27, 2.26s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:03<00:00, 11.28it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 22:39:58,587 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-1120\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 22:39:59,689 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 22:39:59,690 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 22:39:59,739 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1120/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 22:39:59,740 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1120/special_tokens_map.json\n",
- "{'loss': 0.6282, 'grad_norm': 2.980609893798828, 'learning_rate': 8.393240776696274e-05, 'epoch': 2.02}\n",
- "{'loss': 0.516, 'grad_norm': 2.145615577697754, 'learning_rate': 8.354907139929851e-05, 'epoch': 2.03}\n",
- "{'loss': 0.5637, 'grad_norm': 2.9827773571014404, 'learning_rate': 8.316211416299397e-05, 'epoch': 2.05}\n",
- "{'loss': 0.5459, 'grad_norm': 6.992089748382568, 'learning_rate': 8.27715778213905e-05, 'epoch': 2.07}\n",
- "{'loss': 0.4226, 'grad_norm': 2.0110838413238525, 'learning_rate': 8.237750452411353e-05, 'epoch': 2.09}\n",
- "{'loss': 0.5595, 'grad_norm': 1.9566326141357422, 'learning_rate': 8.197993680252334e-05, 'epoch': 2.11}\n",
- "{'loss': 0.6434, 'grad_norm': 3.0069830417633057, 'learning_rate': 8.157891756512488e-05, 'epoch': 2.12}\n",
- "{'loss': 0.5572, 'grad_norm': 2.4360501766204834, 'learning_rate': 8.117449009293668e-05, 'epoch': 2.14}\n",
- "{'loss': 0.5111, 'grad_norm': 3.1125354766845703, 'learning_rate': 8.076669803481965e-05, 'epoch': 2.16}\n",
- "{'loss': 0.5006, 'grad_norm': 3.5583136081695557, 'learning_rate': 8.035558540276618e-05, 'epoch': 2.18}\n",
- "{'loss': 0.5521, 'grad_norm': 2.597862482070923, 'learning_rate': 7.994119656715002e-05, 'epoch': 2.2}\n",
- "{'loss': 0.6284, 'grad_norm': 3.2273318767547607, 'learning_rate': 7.952357625193749e-05, 'epoch': 2.21}\n",
- "{'loss': 0.6074, 'grad_norm': 3.255476474761963, 'learning_rate': 7.91027695298606e-05, 'epoch': 2.23}\n",
- "{'loss': 0.5721, 'grad_norm': 2.2420713901519775, 'learning_rate': 7.86788218175523e-05, 'epoch': 2.25}\n",
- "{'loss': 0.5287, 'grad_norm': 3.241563558578491, 'learning_rate': 7.8251778870645e-05, 'epoch': 2.27}\n",
- "{'loss': 0.5853, 'grad_norm': 3.7581467628479004, 'learning_rate': 7.782168677883206e-05, 'epoch': 2.28}\n",
- "{'loss': 0.6142, 'grad_norm': 4.938629627227783, 'learning_rate': 7.738859196089358e-05, 'epoch': 2.3}\n",
- "{'loss': 0.6064, 'grad_norm': 3.4490935802459717, 'learning_rate': 7.695254115968648e-05, 'epoch': 2.32}\n",
- "{'loss': 0.6328, 'grad_norm': 3.473822832107544, 'learning_rate': 7.651358143709972e-05, 'epoch': 2.34}\n",
- "{'loss': 0.6386, 'grad_norm': 3.5730648040771484, 'learning_rate': 7.60717601689749e-05, 'epoch': 2.36}\n",
- "{'loss': 0.5591, 'grad_norm': 3.024034023284912, 'learning_rate': 7.562712503999327e-05, 'epoch': 2.37}\n",
- "{'loss': 0.7168, 'grad_norm': 3.799771547317505, 'learning_rate': 7.517972403852905e-05, 'epoch': 2.39}\n",
- "{'loss': 0.5869, 'grad_norm': 3.3111960887908936, 'learning_rate': 7.472960545147038e-05, 'epoch': 2.41}\n",
- "{'loss': 0.5025, 'grad_norm': 3.5263280868530273, 'learning_rate': 7.427681785900761e-05, 'epoch': 2.43}\n",
- "{'loss': 0.5964, 'grad_norm': 3.572462797164917, 'learning_rate': 7.382141012939034e-05, 'epoch': 2.45}\n",
- "{'loss': 0.5491, 'grad_norm': 3.038294792175293, 'learning_rate': 7.33634314136531e-05, 'epoch': 2.46}\n",
- "{'loss': 0.6004, 'grad_norm': 7.641390800476074, 'learning_rate': 7.290293114031061e-05, 'epoch': 2.48}\n",
- "{'loss': 0.6356, 'grad_norm': 3.8366777896881104, 'learning_rate': 7.243995901002312e-05, 'epoch': 2.5}\n",
- "{'loss': 0.5982, 'grad_norm': 3.146303176879883, 'learning_rate': 7.197456499023225e-05, 'epoch': 2.52}\n",
- "{'loss': 0.6127, 'grad_norm': 4.154056072235107, 'learning_rate': 7.150679930976825e-05, 'epoch': 2.53}\n",
- "{'loss': 0.5952, 'grad_norm': 2.470127582550049, 'learning_rate': 7.103671245342887e-05, 'epoch': 2.55}\n",
- "{'loss': 0.4994, 'grad_norm': 5.2111053466796875, 'learning_rate': 7.056435515653059e-05, 'epoch': 2.57}\n",
- "{'loss': 0.6969, 'grad_norm': 3.3999710083007812, 'learning_rate': 7.008977839943299e-05, 'epoch': 2.59}\n",
- "{'loss': 0.6066, 'grad_norm': 3.942821979522705, 'learning_rate': 6.961303340203653e-05, 'epoch': 2.61}\n",
- "{'loss': 0.6744, 'grad_norm': 3.511596918106079, 'learning_rate': 6.91341716182545e-05, 'epoch': 2.62}\n",
- "{'loss': 0.5972, 'grad_norm': 2.605888605117798, 'learning_rate': 6.86532447304597e-05, 'epoch': 2.64}\n",
- "{'loss': 0.6058, 'grad_norm': 3.500854730606079, 'learning_rate': 6.817030464390656e-05, 'epoch': 2.66}\n",
- "{'loss': 0.6422, 'grad_norm': 2.9531426429748535, 'learning_rate': 6.768540348112907e-05, 'epoch': 2.68}\n",
- "{'loss': 0.5311, 'grad_norm': 4.346620559692383, 'learning_rate': 6.719859357631535e-05, 'epoch': 2.7}\n",
- "{'loss': 0.4986, 'grad_norm': 4.6267900466918945, 'learning_rate': 6.670992746965938e-05, 'epoch': 2.71}\n",
- "{'loss': 0.6576, 'grad_norm': 4.185196876525879, 'learning_rate': 6.621945790169036e-05, 'epoch': 2.73}\n",
- "{'loss': 0.6437, 'grad_norm': 3.265991687774658, 'learning_rate': 6.572723780758069e-05, 'epoch': 2.75}\n",
- "{'loss': 0.574, 'grad_norm': 4.036723613739014, 'learning_rate': 6.523332031143272e-05, 'epoch': 2.77}\n",
- "{'loss': 0.5839, 'grad_norm': 3.2608094215393066, 'learning_rate': 6.473775872054521e-05, 'epoch': 2.78}\n",
- "{'loss': 0.5044, 'grad_norm': 3.3588390350341797, 'learning_rate': 6.424060651966007e-05, 'epoch': 2.8}\n",
- "{'loss': 0.5707, 'grad_norm': 3.363955020904541, 'learning_rate': 6.374191736518974e-05, 'epoch': 2.82}\n",
- "{'loss': 0.5785, 'grad_norm': 3.4573071002960205, 'learning_rate': 6.324174507942637e-05, 'epoch': 2.84}\n",
- "{'loss': 0.5755, 'grad_norm': 4.1820855140686035, 'learning_rate': 6.274014364473274e-05, 'epoch': 2.86}\n",
- "{'loss': 0.7532, 'grad_norm': 2.9372756481170654, 'learning_rate': 6.22371671977162e-05, 'epoch': 2.87}\n",
- "{'loss': 0.6447, 'grad_norm': 4.2755632400512695, 'learning_rate': 6.173287002338577e-05, 'epoch': 2.89}\n",
- "{'loss': 0.6018, 'grad_norm': 4.274354934692383, 'learning_rate': 6.122730654929334e-05, 'epoch': 2.91}\n",
- "{'loss': 0.5677, 'grad_norm': 4.0272393226623535, 'learning_rate': 6.072053133965938e-05, 'epoch': 2.93}\n",
- "{'loss': 0.6344, 'grad_norm': 3.0991122722625732, 'learning_rate': 6.021259908948402e-05, 'epoch': 2.95}\n",
- "{'loss': 0.6559, 'grad_norm': 3.816575527191162, 'learning_rate': 5.970356461864391e-05, 'epoch': 2.96}\n",
- "{'loss': 0.5647, 'grad_norm': 3.187918186187744, 'learning_rate': 5.919348286597569e-05, 'epoch': 2.98}\n",
- "{'loss': 0.6381, 'grad_norm': 3.6101670265197754, 'learning_rate': 5.868240888334653e-05, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโ | 1680/3360 [1:12:00<2:09:10, 4.61s/it][INFO|trainer.py:3788] 2024-07-04 23:10:44,677 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 23:10:44,677 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 23:10:44,677 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 4%|โโ | 2/46 [00:00<00:03, 13.97it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:05, 8.02it/s]\u001b[A\n",
- " 11%|โโโโโ | 5/46 [00:00<00:05, 7.56it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:05, 6.98it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:00<00:05, 7.22it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:01<00:05, 6.73it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:01<00:05, 6.69it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:01<00:05, 6.45it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:01<00:04, 7.07it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:01<00:04, 7.70it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:01<00:04, 8.19it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:01<00:03, 8.08it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:02<00:03, 8.79it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:02<00:03, 8.65it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:02<00:03, 8.03it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:02<00:02, 9.13it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:02<00:02, 9.34it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:02<00:02, 9.05it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:02<00:02, 9.08it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:03<00:02, 8.51it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:03<00:02, 8.09it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:03<00:02, 7.71it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:03<00:02, 7.18it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:03<00:02, 6.16it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:03<00:02, 5.74it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:04<00:02, 5.64it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:04<00:02, 6.10it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:04<00:02, 5.99it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:04<00:01, 6.17it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:04<00:01, 6.58it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:04<00:01, 6.82it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:05<00:01, 6.79it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:05<00:01, 6.98it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:05<00:00, 7.14it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:05<00:00, 6.82it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:05<00:00, 6.82it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:05<00:00, 7.10it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:05<00:00, 6.85it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:05<00:00, 7.08it/s]\u001b[A\n",
- " 98%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 45/46 [00:06<00:00, 7.02it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.19065260887146, 'eval_runtime': 6.452, 'eval_samples_per_second': 7.13, 'eval_steps_per_second': 7.13, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโ | 1680/3360 [1:12:07<2:09:10, 4.61s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:06<00:00, 7.19it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 23:10:51,132 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-1680\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 23:10:52,385 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 23:10:52,387 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 23:10:52,534 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1680/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 23:10:52,535 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-1680/special_tokens_map.json\n",
- "{'loss': 0.3795, 'grad_norm': 3.6095104217529297, 'learning_rate': 5.8170397829712485e-05, 'epoch': 3.02}\n",
- "{'loss': 0.1885, 'grad_norm': 2.648378610610962, 'learning_rate': 5.765750496516547e-05, 'epoch': 3.03}\n",
- "{'loss': 0.2431, 'grad_norm': 3.102599859237671, 'learning_rate': 5.714378564496901e-05, 'epoch': 3.05}\n",
- "{'loss': 0.2407, 'grad_norm': 1.7132669687271118, 'learning_rate': 5.6629295313583974e-05, 'epoch': 3.07}\n",
- "{'loss': 0.1859, 'grad_norm': 2.363086462020874, 'learning_rate': 5.611408949868457e-05, 'epoch': 3.09}\n",
- "{'loss': 0.2814, 'grad_norm': 2.7699074745178223, 'learning_rate': 5.559822380516539e-05, 'epoch': 3.11}\n",
- "{'loss': 0.2066, 'grad_norm': 2.309485912322998, 'learning_rate': 5.5081753909140096e-05, 'epoch': 3.12}\n",
- "{'loss': 0.2561, 'grad_norm': 3.8177757263183594, 'learning_rate': 5.456473555193242e-05, 'epoch': 3.14}\n",
- "{'loss': 0.2839, 'grad_norm': 5.046483039855957, 'learning_rate': 5.404722453406017e-05, 'epoch': 3.16}\n",
- "{'loss': 0.2309, 'grad_norm': 3.3046510219573975, 'learning_rate': 5.3529276709212816e-05, 'epoch': 3.18}\n",
- "{'loss': 0.2678, 'grad_norm': 3.739877939224243, 'learning_rate': 5.30109479782233e-05, 'epoch': 3.2}\n",
- "{'loss': 0.2305, 'grad_norm': 3.0891871452331543, 'learning_rate': 5.249229428303486e-05, 'epoch': 3.21}\n",
- "{'loss': 0.3009, 'grad_norm': 2.0775339603424072, 'learning_rate': 5.197337160066331e-05, 'epoch': 3.23}\n",
- "{'loss': 0.1974, 'grad_norm': 4.094172477722168, 'learning_rate': 5.145423593715557e-05, 'epoch': 3.25}\n",
- "{'loss': 0.2613, 'grad_norm': 3.4857871532440186, 'learning_rate': 5.0934943321545115e-05, 'epoch': 3.27}\n",
- "{'loss': 0.1759, 'grad_norm': 5.555017948150635, 'learning_rate': 5.041554979980486e-05, 'epoch': 3.28}\n",
- "{'loss': 0.2755, 'grad_norm': 5.37070894241333, 'learning_rate': 4.9896111428798254e-05, 'epoch': 3.3}\n",
- "{'loss': 0.3013, 'grad_norm': 3.0473411083221436, 'learning_rate': 4.9376684270229254e-05, 'epoch': 3.32}\n",
- "{'loss': 0.2713, 'grad_norm': 2.421534299850464, 'learning_rate': 4.8857324384591653e-05, 'epoch': 3.34}\n",
- "{'loss': 0.2342, 'grad_norm': 3.430769205093384, 'learning_rate': 4.8338087825118675e-05, 'epoch': 3.36}\n",
- "{'loss': 0.2836, 'grad_norm': 3.117511510848999, 'learning_rate': 4.781903063173321e-05, 'epoch': 3.37}\n",
- "{'loss': 0.2305, 'grad_norm': 2.2710249423980713, 'learning_rate': 4.730020882499964e-05, 'epoch': 3.39}\n",
- "{'loss': 0.2707, 'grad_norm': 2.8062386512756348, 'learning_rate': 4.678167840007767e-05, 'epoch': 3.41}\n",
- "{'loss': 0.2347, 'grad_norm': 3.199958324432373, 'learning_rate': 4.626349532067879e-05, 'epoch': 3.43}\n",
- "{'loss': 0.2987, 'grad_norm': 2.9405529499053955, 'learning_rate': 4.574571551302647e-05, 'epoch': 3.44}\n",
- "{'loss': 0.2748, 'grad_norm': 2.3248393535614014, 'learning_rate': 4.522839485981994e-05, 'epoch': 3.46}\n",
- "{'loss': 0.2595, 'grad_norm': 2.7082927227020264, 'learning_rate': 4.471158919420312e-05, 'epoch': 3.48}\n",
- "{'loss': 0.2452, 'grad_norm': 2.636992931365967, 'learning_rate': 4.4195354293738484e-05, 'epoch': 3.5}\n",
- "{'loss': 0.2322, 'grad_norm': 2.870598554611206, 'learning_rate': 4.367974587438733e-05, 'epoch': 3.52}\n",
- "{'loss': 0.2822, 'grad_norm': 2.3464884757995605, 'learning_rate': 4.316481958449634e-05, 'epoch': 3.53}\n",
- "{'loss': 0.2228, 'grad_norm': 4.499746322631836, 'learning_rate': 4.2650630998791615e-05, 'epoch': 3.55}\n",
- "{'loss': 0.2826, 'grad_norm': 3.5622456073760986, 'learning_rate': 4.213723561238074e-05, 'epoch': 3.57}\n",
- "{'loss': 0.2505, 'grad_norm': 2.92927622795105, 'learning_rate': 4.162468883476319e-05, 'epoch': 3.59}\n",
- "{'loss': 0.2715, 'grad_norm': 4.32992696762085, 'learning_rate': 4.111304598385018e-05, 'epoch': 3.61}\n",
- "{'loss': 0.2382, 'grad_norm': 3.33722186088562, 'learning_rate': 4.060236227999441e-05, 'epoch': 3.62}\n",
- "{'loss': 0.2219, 'grad_norm': 3.15584135055542, 'learning_rate': 4.0092692840030134e-05, 'epoch': 3.64}\n",
- "{'loss': 0.2593, 'grad_norm': 2.6653778553009033, 'learning_rate': 3.9584092671324606e-05, 'epoch': 3.66}\n",
- "{'loss': 0.2825, 'grad_norm': 2.261251449584961, 'learning_rate': 3.907661666584131e-05, 'epoch': 3.68}\n",
- "{'loss': 0.2472, 'grad_norm': 2.40474796295166, 'learning_rate': 3.857031959421553e-05, 'epoch': 3.69}\n",
- "{'loss': 0.2667, 'grad_norm': 3.5820109844207764, 'learning_rate': 3.806525609984312e-05, 'epoch': 3.71}\n",
- "{'loss': 0.2426, 'grad_norm': 4.061399459838867, 'learning_rate': 3.7561480692983006e-05, 'epoch': 3.73}\n",
- "{'loss': 0.3113, 'grad_norm': 3.3326733112335205, 'learning_rate': 3.705904774487396e-05, 'epoch': 3.75}\n",
- "{'loss': 0.2658, 'grad_norm': 3.946682929992676, 'learning_rate': 3.655801148186655e-05, 'epoch': 3.77}\n",
- "{'loss': 0.2051, 'grad_norm': 1.952369213104248, 'learning_rate': 3.6058425979570485e-05, 'epoch': 3.78}\n",
- "{'loss': 0.249, 'grad_norm': 3.139533519744873, 'learning_rate': 3.556034515701852e-05, 'epoch': 3.8}\n",
- "{'loss': 0.2398, 'grad_norm': 2.4751150608062744, 'learning_rate': 3.506382277084696e-05, 'epoch': 3.82}\n",
- "{'loss': 0.2659, 'grad_norm': 1.9120585918426514, 'learning_rate': 3.4568912409493945e-05, 'epoch': 3.84}\n",
- "{'loss': 0.1847, 'grad_norm': 2.8865163326263428, 'learning_rate': 3.4075667487415785e-05, 'epoch': 3.86}\n",
- "{'loss': 0.2245, 'grad_norm': 3.2274515628814697, 'learning_rate': 3.358414123932195e-05, 'epoch': 3.87}\n",
- "{'loss': 0.2643, 'grad_norm': 2.924294948577881, 'learning_rate': 3.3094386714429724e-05, 'epoch': 3.89}\n",
- "{'loss': 0.252, 'grad_norm': 3.187256336212158, 'learning_rate': 3.2606456770738636e-05, 'epoch': 3.91}\n",
- "{'loss': 0.1969, 'grad_norm': 2.353398084640503, 'learning_rate': 3.212040406932569e-05, 'epoch': 3.93}\n",
- "{'loss': 0.2, 'grad_norm': 2.357897996902466, 'learning_rate': 3.163628106866172e-05, 'epoch': 3.94}\n",
- "{'loss': 0.2773, 'grad_norm': 3.165809392929077, 'learning_rate': 3.115414001894974e-05, 'epoch': 3.96}\n",
- "{'loss': 0.2495, 'grad_norm': 3.546583414077759, 'learning_rate': 3.067403295648566e-05, 'epoch': 3.98}\n",
- "{'loss': 0.2513, 'grad_norm': 3.0604918003082275, 'learning_rate': 3.019601169804216e-05, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [1:48:51<1:17:16, 4.14s/it][INFO|trainer.py:3788] 2024-07-04 23:47:35,277 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-04 23:47:35,278 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-04 23:47:35,278 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 4%|โโ | 2/46 [00:00<00:02, 17.70it/s]\u001b[A\n",
- " 9%|โโโโ | 4/46 [00:00<00:03, 11.55it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:03, 11.20it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:03, 10.88it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:00<00:03, 10.72it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:01<00:03, 10.60it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:01<00:03, 9.99it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:01<00:03, 9.65it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:01<00:03, 9.65it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:01<00:02, 9.40it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:01<00:02, 9.48it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:01<00:02, 9.53it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:02<00:02, 9.54it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:02<00:02, 9.41it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:02<00:02, 9.45it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:02<00:02, 9.51it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:02<00:02, 9.48it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:02<00:02, 9.35it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:02<00:02, 8.71it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:02<00:02, 8.77it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:02<00:01, 8.66it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:03<00:01, 8.43it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:03<00:01, 8.09it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:03<00:01, 7.41it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:03<00:01, 7.04it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:03<00:01, 6.71it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:03<00:01, 6.51it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:04<00:01, 6.30it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:04<00:01, 4.95it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:04<00:01, 4.03it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:05<00:02, 3.03it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:05<00:01, 3.01it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:05<00:01, 3.11it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:06<00:01, 3.36it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:06<00:00, 3.57it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:06<00:00, 3.79it/s]\u001b[A\n",
- " 98%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 45/46 [00:06<00:00, 3.73it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.564648389816284, 'eval_runtime': 7.2063, 'eval_samples_per_second': 6.383, 'eval_steps_per_second': 6.383, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [1:48:58<1:17:16, 4.14s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:07<00:00, 4.11it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-04 23:47:42,489 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-2240\n",
- "[INFO|configuration_utils.py:733] 2024-07-04 23:47:44,213 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-04 23:47:44,213 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-04 23:47:44,277 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2240/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-04 23:47:44,277 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2240/special_tokens_map.json\n",
- "{'loss': 0.1188, 'grad_norm': 1.1784201860427856, 'learning_rate': 2.9720127835276256e-05, 'epoch': 4.02}\n",
- "{'loss': 0.0602, 'grad_norm': 1.9491609334945679, 'learning_rate': 2.9246432729161055e-05, 'epoch': 4.03}\n",
- "{'loss': 0.1191, 'grad_norm': 8.893132209777832, 'learning_rate': 2.8774977504442647e-05, 'epoch': 4.05}\n",
- "{'loss': 0.0814, 'grad_norm': 2.4567410945892334, 'learning_rate': 2.8305813044122097e-05, 'epoch': 4.07}\n",
- "{'loss': 0.0717, 'grad_norm': 1.0190716981887817, 'learning_rate': 2.7838989983964065e-05, 'epoch': 4.09}\n",
- "{'loss': 0.1036, 'grad_norm': 2.9603097438812256, 'learning_rate': 2.737455870703155e-05, 'epoch': 4.11}\n",
- "{'loss': 0.0639, 'grad_norm': 0.43291687965393066, 'learning_rate': 2.6912569338248315e-05, 'epoch': 4.12}\n",
- "{'loss': 0.1147, 'grad_norm': 16.320343017578125, 'learning_rate': 2.645307173898901e-05, 'epoch': 4.14}\n",
- "{'loss': 0.083, 'grad_norm': 2.4415814876556396, 'learning_rate': 2.5996115501697694e-05, 'epoch': 4.16}\n",
- "{'loss': 0.0894, 'grad_norm': 2.5519323348999023, 'learning_rate': 2.5541749944535554e-05, 'epoch': 4.18}\n",
- "{'loss': 0.0634, 'grad_norm': 1.0128456354141235, 'learning_rate': 2.5090024106057962e-05, 'epoch': 4.19}\n",
- "{'loss': 0.0978, 'grad_norm': 0.7708680629730225, 'learning_rate': 2.464098673992205e-05, 'epoch': 4.21}\n",
- "{'loss': 0.0895, 'grad_norm': 2.129037618637085, 'learning_rate': 2.4194686309624663e-05, 'epoch': 4.23}\n",
- "{'loss': 0.0986, 'grad_norm': 2.0388691425323486, 'learning_rate': 2.3751170983272e-05, 'epoch': 4.25}\n",
- "{'loss': 0.1058, 'grad_norm': 2.9288082122802734, 'learning_rate': 2.3310488628380757e-05, 'epoch': 4.27}\n",
- "{'loss': 0.1175, 'grad_norm': 4.13016414642334, 'learning_rate': 2.2872686806712035e-05, 'epoch': 4.28}\n",
- "{'loss': 0.1101, 'grad_norm': 2.0640783309936523, 'learning_rate': 2.243781276913811e-05, 'epoch': 4.3}\n",
- "{'loss': 0.0602, 'grad_norm': 2.8615546226501465, 'learning_rate': 2.200591345054267e-05, 'epoch': 4.32}\n",
- "{'loss': 0.1019, 'grad_norm': 3.2558248043060303, 'learning_rate': 2.157703546475539e-05, 'epoch': 4.34}\n",
- "{'loss': 0.0819, 'grad_norm': 2.1427247524261475, 'learning_rate': 2.115122509952085e-05, 'epoch': 4.36}\n",
- "{'loss': 0.0767, 'grad_norm': 7.249903202056885, 'learning_rate': 2.0728528311502976e-05, 'epoch': 4.37}\n",
- "{'loss': 0.0823, 'grad_norm': 2.022773027420044, 'learning_rate': 2.0308990721324927e-05, 'epoch': 4.39}\n",
- "{'loss': 0.0797, 'grad_norm': 3.4550766944885254, 'learning_rate': 1.989265760864542e-05, 'epoch': 4.41}\n",
- "{'loss': 0.0927, 'grad_norm': 1.1615883111953735, 'learning_rate': 1.947957390727185e-05, 'epoch': 4.43}\n",
- "{'loss': 0.0782, 'grad_norm': 3.103994607925415, 'learning_rate': 1.906978420031059e-05, 'epoch': 4.44}\n",
- "{'loss': 0.0575, 'grad_norm': 1.6370556354522705, 'learning_rate': 1.8663332715355396e-05, 'epoch': 4.46}\n",
- "{'loss': 0.1022, 'grad_norm': 1.106717824935913, 'learning_rate': 1.8260263319713844e-05, 'epoch': 4.48}\n",
- "{'loss': 0.1071, 'grad_norm': 3.171022415161133, 'learning_rate': 1.7860619515673033e-05, 'epoch': 4.5}\n",
- "{'loss': 0.1038, 'grad_norm': 1.9004364013671875, 'learning_rate': 1.746444443580433e-05, 'epoch': 4.52}\n",
- "{'loss': 0.0836, 'grad_norm': 1.7966681718826294, 'learning_rate': 1.7071780838308288e-05, 'epoch': 4.53}\n",
- "{'loss': 0.0773, 'grad_norm': 2.2593512535095215, 'learning_rate': 1.6682671102399805e-05, 'epoch': 4.55}\n",
- "{'loss': 0.0671, 'grad_norm': 2.4209578037261963, 'learning_rate': 1.629715722373423e-05, 'epoch': 4.57}\n",
- "{'loss': 0.0869, 'grad_norm': 3.6910362243652344, 'learning_rate': 1.5915280809874932e-05, 'epoch': 4.59}\n",
- "{'loss': 0.0713, 'grad_norm': 2.8420000076293945, 'learning_rate': 1.553708307580265e-05, 'epoch': 4.61}\n",
- "{'loss': 0.0886, 'grad_norm': 1.897133469581604, 'learning_rate': 1.5162604839467265e-05, 'epoch': 4.62}\n",
- "{'loss': 0.0804, 'grad_norm': 2.0078957080841064, 'learning_rate': 1.4791886517382413e-05, 'epoch': 4.64}\n",
- "{'loss': 0.0828, 'grad_norm': 2.6949617862701416, 'learning_rate': 1.4424968120263504e-05, 'epoch': 4.66}\n",
- "{'loss': 0.0906, 'grad_norm': 2.1701433658599854, 'learning_rate': 1.4061889248709343e-05, 'epoch': 4.68}\n",
- "{'loss': 0.0854, 'grad_norm': 2.741436004638672, 'learning_rate': 1.370268908892825e-05, 'epoch': 4.69}\n",
- "{'loss': 0.0847, 'grad_norm': 1.9649664163589478, 'learning_rate': 1.3347406408508695e-05, 'epoch': 4.71}\n",
- "{'loss': 0.1074, 'grad_norm': 2.995682716369629, 'learning_rate': 1.2996079552235263e-05, 'epoch': 4.73}\n",
- "{'loss': 0.0675, 'grad_norm': 1.7899149656295776, 'learning_rate': 1.264874643795021e-05, 'epoch': 4.75}\n",
- "{'loss': 0.0736, 'grad_norm': 3.165422201156616, 'learning_rate': 1.230544455246101e-05, 'epoch': 4.77}\n",
- "{'loss': 0.0949, 'grad_norm': 3.376789093017578, 'learning_rate': 1.1966210947494583e-05, 'epoch': 4.78}\n",
- "{'loss': 0.0774, 'grad_norm': 0.7393803000450134, 'learning_rate': 1.1631082235698316e-05, 'epoch': 4.8}\n",
- "{'loss': 0.0685, 'grad_norm': 4.275796890258789, 'learning_rate': 1.130009458668863e-05, 'epoch': 4.82}\n",
- "{'loss': 0.0642, 'grad_norm': 1.65122652053833, 'learning_rate': 1.097328372314721e-05, 'epoch': 4.84}\n",
- "{'loss': 0.0855, 'grad_norm': 1.4425795078277588, 'learning_rate': 1.0650684916965559e-05, 'epoch': 4.85}\n",
- "{'loss': 0.0883, 'grad_norm': 2.1447832584381104, 'learning_rate': 1.0332332985438248e-05, 'epoch': 4.87}\n",
- "{'loss': 0.1137, 'grad_norm': 2.644052743911743, 'learning_rate': 1.0018262287505086e-05, 'epoch': 4.89}\n",
- "{'loss': 0.1026, 'grad_norm': 0.3625916838645935, 'learning_rate': 9.708506720042932e-06, 'epoch': 4.91}\n",
- "{'loss': 0.0708, 'grad_norm': 0.9670233130455017, 'learning_rate': 9.403099714207175e-06, 'epoch': 4.93}\n",
- "{'loss': 0.0886, 'grad_norm': 1.222226619720459, 'learning_rate': 9.102074231823727e-06, 'epoch': 4.94}\n",
- "{'loss': 0.0913, 'grad_norm': 1.5419262647628784, 'learning_rate': 8.805462761831418e-06, 'epoch': 4.96}\n",
- "{'loss': 0.105, 'grad_norm': 1.7759844064712524, 'learning_rate': 8.513297316775625e-06, 'epoch': 4.98}\n",
- "{'loss': 0.0818, 'grad_norm': 1.2991019487380981, 'learning_rate': 8.225609429353187e-06, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [2:24:53<36:03, 3.86s/it][INFO|trainer.py:3788] 2024-07-05 00:23:37,381 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 00:23:37,382 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 00:23:37,382 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:02, 15.46it/s]\u001b[A\n",
- " 11%|โโโโโ | 5/46 [00:00<00:03, 12.42it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:00<00:03, 11.09it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:03, 10.00it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:01<00:03, 9.99it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:01<00:03, 9.77it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:01<00:03, 9.58it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:01<00:03, 9.25it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:01<00:03, 8.92it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:01<00:03, 8.72it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:01<00:03, 8.10it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:02<00:03, 7.66it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:02<00:03, 7.21it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:02<00:03, 7.20it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:02<00:03, 6.78it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:02<00:03, 6.77it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:02<00:03, 6.86it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:02<00:03, 6.94it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:03<00:02, 6.87it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:03<00:02, 6.55it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:03<00:02, 6.19it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:03<00:02, 6.08it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:03<00:02, 5.95it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:03<00:02, 5.75it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:04<00:02, 5.04it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:04<00:02, 4.73it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:04<00:02, 4.79it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:04<00:02, 4.60it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:05<00:01, 5.26it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:05<00:01, 5.95it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:05<00:01, 6.72it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:05<00:00, 7.27it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:05<00:00, 7.76it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:05<00:00, 8.19it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:05<00:00, 8.96it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 3.000229835510254, 'eval_runtime': 6.222, 'eval_samples_per_second': 7.393, 'eval_steps_per_second': 7.393, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [2:24:59<36:03, 3.86s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:06<00:00, 8.63it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-05 00:23:43,607 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-2800\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 00:23:45,000 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 00:23:45,001 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-05 00:23:45,087 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2800/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-05 00:23:45,087 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-2800/special_tokens_map.json\n",
- "{'loss': 0.0391, 'grad_norm': 1.8985695838928223, 'learning_rate': 7.942430149009161e-06, 'epoch': 5.02}\n",
- "{'loss': 0.0262, 'grad_norm': 0.18104498088359833, 'learning_rate': 7.663790038585793e-06, 'epoch': 5.03}\n",
- "{'loss': 0.0369, 'grad_norm': 0.4857228696346283, 'learning_rate': 7.389719171023857e-06, 'epoch': 5.05}\n",
- "{'loss': 0.0285, 'grad_norm': 0.5048622488975525, 'learning_rate': 7.1202471261170245e-06, 'epoch': 5.07}\n",
- "{'loss': 0.0239, 'grad_norm': 1.3091479539871216, 'learning_rate': 6.855402987319348e-06, 'epoch': 5.09}\n",
- "{'loss': 0.0315, 'grad_norm': 0.7383649945259094, 'learning_rate': 6.595215338606397e-06, 'epoch': 5.1}\n",
- "{'loss': 0.0227, 'grad_norm': 0.46847808361053467, 'learning_rate': 6.339712261390213e-06, 'epoch': 5.12}\n",
- "{'loss': 0.0286, 'grad_norm': 2.871511936187744, 'learning_rate': 6.088921331488568e-06, 'epoch': 5.14}\n",
- "{'loss': 0.0215, 'grad_norm': 0.5253076553344727, 'learning_rate': 5.8428696161488215e-06, 'epoch': 5.16}\n",
- "{'loss': 0.0212, 'grad_norm': 0.7373698949813843, 'learning_rate': 5.601583671126531e-06, 'epoch': 5.18}\n",
- "{'loss': 0.0468, 'grad_norm': 1.2003121376037598, 'learning_rate': 5.365089537819434e-06, 'epoch': 5.19}\n",
- "{'loss': 0.0269, 'grad_norm': 0.1384514421224594, 'learning_rate': 5.133412740456806e-06, 'epoch': 5.21}\n",
- "{'loss': 0.016, 'grad_norm': 0.6597172617912292, 'learning_rate': 4.906578283344759e-06, 'epoch': 5.23}\n",
- "{'loss': 0.0273, 'grad_norm': 1.3373147249221802, 'learning_rate': 4.684610648167503e-06, 'epoch': 5.25}\n",
- "{'loss': 0.022, 'grad_norm': 1.9218050241470337, 'learning_rate': 4.467533791345191e-06, 'epoch': 5.27}\n",
- "{'loss': 0.0266, 'grad_norm': 0.33371880650520325, 'learning_rate': 4.255371141448272e-06, 'epoch': 5.28}\n",
- "{'loss': 0.0246, 'grad_norm': 0.3639131486415863, 'learning_rate': 4.048145596668967e-06, 'epoch': 5.3}\n",
- "{'loss': 0.04, 'grad_norm': 0.7324997186660767, 'learning_rate': 3.84587952234991e-06, 'epoch': 5.32}\n",
- "{'loss': 0.02, 'grad_norm': 1.7712045907974243, 'learning_rate': 3.6485947485702832e-06, 'epoch': 5.34}\n",
- "{'loss': 0.0304, 'grad_norm': 1.001847267150879, 'learning_rate': 3.4563125677897932e-06, 'epoch': 5.35}\n",
- "{'loss': 0.0251, 'grad_norm': 1.4244178533554077, 'learning_rate': 3.269053732550581e-06, 'epoch': 5.37}\n",
- "{'loss': 0.0201, 'grad_norm': 0.938901960849762, 'learning_rate': 3.086838453237506e-06, 'epoch': 5.39}\n",
- "{'loss': 0.017, 'grad_norm': 0.722439706325531, 'learning_rate': 2.9096863958968268e-06, 'epoch': 5.41}\n",
- "{'loss': 0.0278, 'grad_norm': 0.9856802225112915, 'learning_rate': 2.737616680113758e-06, 'epoch': 5.43}\n",
- "{'loss': 0.0275, 'grad_norm': 1.7459590435028076, 'learning_rate': 2.570647876948895e-06, 'epoch': 5.44}\n",
- "{'loss': 0.0419, 'grad_norm': 15.734712600708008, 'learning_rate': 2.408798006933882e-06, 'epoch': 5.46}\n",
- "{'loss': 0.0498, 'grad_norm': 0.5652347207069397, 'learning_rate': 2.252084538126542e-06, 'epoch': 5.48}\n",
- "{'loss': 0.0281, 'grad_norm': 0.6292805075645447, 'learning_rate': 2.100524384225555e-06, 'epoch': 5.5}\n",
- "{'loss': 0.025, 'grad_norm': 1.3762198686599731, 'learning_rate': 1.9541339027450256e-06, 'epoch': 5.52}\n",
- "{'loss': 0.0228, 'grad_norm': 0.6231855154037476, 'learning_rate': 1.8129288932490274e-06, 'epoch': 5.53}\n",
- "{'loss': 0.021, 'grad_norm': 0.2345045506954193, 'learning_rate': 1.6769245956464396e-06, 'epoch': 5.55}\n",
- "{'loss': 0.0314, 'grad_norm': 0.8907411694526672, 'learning_rate': 1.5461356885461075e-06, 'epoch': 5.57}\n",
- "{'loss': 0.0324, 'grad_norm': 0.8636724948883057, 'learning_rate': 1.4205762876726092e-06, 'epoch': 5.59}\n",
- "{'loss': 0.0306, 'grad_norm': 1.4055633544921875, 'learning_rate': 1.3002599443428243e-06, 'epoch': 5.6}\n",
- "{'loss': 0.0276, 'grad_norm': 0.9670897722244263, 'learning_rate': 1.1851996440033319e-06, 'epoch': 5.62}\n",
- "{'loss': 0.0328, 'grad_norm': 0.16922369599342346, 'learning_rate': 1.0754078048289374e-06, 'epoch': 5.64}\n",
- "{'loss': 0.031, 'grad_norm': 1.8827847242355347, 'learning_rate': 9.708962763824048e-07, 'epoch': 5.66}\n",
- "{'loss': 0.0214, 'grad_norm': 0.40066924691200256, 'learning_rate': 8.716763383355864e-07, 'epoch': 5.68}\n",
- "{'loss': 0.0272, 'grad_norm': 0.28809547424316406, 'learning_rate': 7.777586992519959e-07, 'epoch': 5.69}\n",
- "{'loss': 0.0253, 'grad_norm': 1.053158164024353, 'learning_rate': 6.891534954310885e-07, 'epoch': 5.71}\n",
- "{'loss': 0.025, 'grad_norm': 0.2853540778160095, 'learning_rate': 6.058702898142643e-07, 'epoch': 5.73}\n",
- "{'loss': 0.0354, 'grad_norm': 1.2035536766052246, 'learning_rate': 5.279180709527765e-07, 'epoch': 5.75}\n",
- "{'loss': 0.0276, 'grad_norm': 0.9827560782432556, 'learning_rate': 4.553052520375911e-07, 'epoch': 5.77}\n",
- "{'loss': 0.0209, 'grad_norm': 0.42196208238601685, 'learning_rate': 3.8803966999139684e-07, 'epoch': 5.78}\n",
- "{'loss': 0.0265, 'grad_norm': 1.0920729637145996, 'learning_rate': 3.261285846227868e-07, 'epoch': 5.8}\n",
- "{'loss': 0.0218, 'grad_norm': 0.4562773108482361, 'learning_rate': 2.6957867784270787e-07, 'epoch': 5.82}\n",
- "{'loss': 0.0229, 'grad_norm': 1.235041618347168, 'learning_rate': 2.1839605294330933e-07, 'epoch': 5.84}\n",
- "{'loss': 0.0371, 'grad_norm': 0.8272603154182434, 'learning_rate': 1.725862339392259e-07, 'epoch': 5.85}\n",
- "{'loss': 0.0187, 'grad_norm': 0.5107071399688721, 'learning_rate': 1.3215416497138754e-07, 'epoch': 5.87}\n",
- "{'loss': 0.0347, 'grad_norm': 1.0998457670211792, 'learning_rate': 9.710420977340762e-08, 'epoch': 5.89}\n",
- "{'loss': 0.027, 'grad_norm': 1.8781795501708984, 'learning_rate': 6.744015120061509e-08, 'epoch': 5.91}\n",
- "{'loss': 0.0351, 'grad_norm': 0.9750437140464783, 'learning_rate': 4.316519082179227e-08, 'epoch': 5.93}\n",
- "{'loss': 0.0209, 'grad_norm': 1.2990669012069702, 'learning_rate': 2.4281948573617874e-08, 'epoch': 5.94}\n",
- "{'loss': 0.0354, 'grad_norm': 1.9354966878890991, 'learning_rate': 1.0792462477909882e-08, 'epoch': 5.96}\n",
- "{'loss': 0.0381, 'grad_norm': 1.044374704360962, 'learning_rate': 2.6981884216847884e-09, 'epoch': 5.98}\n",
- "{'loss': 0.0228, 'grad_norm': 0.6751245856285095, 'learning_rate': 0.0, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [3:00:43<00:00, 3.75s/it][INFO|trainer.py:3788] 2024-07-05 00:59:27,574 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 00:59:27,574 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 00:59:27,574 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:02, 15.75it/s]\u001b[A\n",
- " 11%|โโโโโ | 5/46 [00:00<00:03, 12.78it/s]\u001b[A\n",
- " 15%|โโโโโโโ | 7/46 [00:00<00:03, 11.87it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:03, 11.54it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:00<00:03, 11.21it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:01<00:03, 10.94it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:01<00:02, 10.42it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:01<00:02, 10.38it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:01<00:02, 9.50it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:01<00:02, 9.13it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:02<00:02, 9.21it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:02<00:02, 9.18it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:02<00:02, 9.02it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:02<00:02, 8.78it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:02<00:02, 8.65it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:02<00:02, 8.41it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:02<00:02, 7.43it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:02<00:02, 7.47it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:03<00:02, 7.36it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:03<00:02, 7.17it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:03<00:02, 6.67it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:03<00:02, 5.32it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:03<00:02, 4.79it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:04<00:02, 4.36it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:04<00:02, 4.02it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:04<00:02, 4.26it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:04<00:01, 4.76it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:04<00:01, 5.19it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:05<00:01, 5.53it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:05<00:01, 5.88it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:05<00:00, 6.21it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:05<00:00, 6.42it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:05<00:00, 6.63it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:05<00:00, 6.74it/s]\u001b[A\n",
- " 98%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 45/46 [00:06<00:00, 6.80it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 3.4013702869415283, 'eval_runtime': 6.2741, 'eval_samples_per_second': 7.332, 'eval_steps_per_second': 7.332, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [3:00:49<00:00, 3.75s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:06<00:00, 6.66it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-05 00:59:33,853 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft/checkpoint-3360\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 00:59:35,314 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 00:59:35,316 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-05 00:59:35,381 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/checkpoint-3360/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-05 00:59:35,382 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/checkpoint-3360/special_tokens_map.json\n",
- "[INFO|:482] 2024-07-05 00:59:35,695 >> \n",
- "\n",
- "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
- "\n",
- "\n",
- "{'train_runtime': 10857.6726, 'train_samples_per_second': 2.477, 'train_steps_per_second': 0.309, 'train_loss': 0.6667878782021858, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [3:00:51<00:00, 3.23s/it]\n",
- "[INFO|trainer.py:3478] 2024-07-05 00:59:35,700 >> Saving model checkpoint to saves/qwen2-1.5b/lora/sft\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 00:59:36,890 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-1.5b-instruct-bnb-4bit/snapshots/9f10684b3a26fbf25e50921655353e2e3e599d70/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 00:59:36,891 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-1.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 1536,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 8960,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 28,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 12,\n",
- " \"num_hidden_layers\": 28,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|tokenization_utils_base.py:2574] 2024-07-05 00:59:36,947 >> tokenizer config file saved in saves/qwen2-1.5b/lora/sft/tokenizer_config.json\n",
- "[INFO|tokenization_utils_base.py:2583] 2024-07-05 00:59:36,947 >> Special tokens file saved in saves/qwen2-1.5b/lora/sft/special_tokens_map.json\n",
- "***** train metrics *****\n",
- " epoch = 5.9973\n",
- " total_flos = 19692141GF\n",
- " train_loss = 0.6668\n",
- " train_runtime = 3:00:57.67\n",
- " train_samples_per_second = 2.477\n",
- " train_steps_per_second = 0.309\n",
- "Figure saved at: saves/qwen2-1.5b/lora/sft/training_loss.png\n",
- "Figure saved at: saves/qwen2-1.5b/lora/sft/training_eval_loss.png\n",
- "[INFO|trainer.py:3788] 2024-07-05 00:59:37,341 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 00:59:37,341 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 00:59:37,341 >> Batch size = 1\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:05<00:00, 7.96it/s]\n",
- "***** eval metrics *****\n",
- " epoch = 5.9973\n",
- " eval_loss = 3.4014\n",
- " eval_runtime = 0:00:05.94\n",
- " eval_samples_per_second = 7.742\n",
- " eval_steps_per_second = 7.742\n",
- "[INFO|modelcard.py:449] 2024-07-05 00:59:43,285 >> Dropping the following result as it does not have all the necessary fields:\n",
- "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: | 0.091 MB of 0.091 MB uploaded\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime โโโโโโโ
\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate โโโ
โโโโโโโโโโโโโโโโโ
โ
โ
โโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss โโโโโโโโ
โโโ
โ
โ
โ
โโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss 3.40137\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime 5.9413\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second 7.742\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second 7.742\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: total_flos 2.114427607798579e+16\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch 5.99732\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step 3360\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm 0.67512\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate 0.0\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 0.0228\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_loss 0.66679\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_runtime 10857.6726\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_samples_per_second 2.477\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_steps_per_second 0.309\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run \u001b[33mqwen2_1.5b_lora_sft\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/4fbnqsea\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 6 W&B file(s), 0 media file(s), 1 artifact file(s) and 0 other file(s)\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20240704_215839-4fbnqsea/logs\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The new W&B backend becomes opt-out in version 0.18.0; try it out with `wandb.require(\"core\")`! See https://wandb.me/wandb-core for more information.\n",
- "CPU times: user 3min 32s, sys: 1min 10s, total: 4min 43s\n",
- "Wall time: 3h 3min 14s\n"
- ]
- }
- ],
- "source": [
- "%%time\n",
- "\n",
- "!./scripts/tune-lf.sh config/qwen2_1.5b_lora_sft_unsloth.yaml"
- ]
- },
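- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Both tuning runs in this notebook share one schedule, printed in the Unsloth banner of the next cell: 4,482 training examples (4,528 total minus the 46 held out for eval), per-device batch size 1, gradient accumulation 8, 6 epochs. A minimal sketch, assuming exactly those logged values (the variable names are ours, not LLaMA-Factory API), reproduces the 3,360-step total and the once-per-epoch checkpoint numbers seen in the logs:\n",
- "\n",
- "```python\n",
- "# Derive the training schedule from the hyperparameters logged above.\n",
- "num_examples = 4482\n",
- "per_device_batch_size = 1\n",
- "grad_accum_steps = 8\n",
- "num_epochs = 6\n",
- "\n",
- "effective_batch = per_device_batch_size * grad_accum_steps  # 8\n",
- "steps_per_epoch = num_examples // effective_batch           # 560 (trailing partial step dropped)\n",
- "total_steps = steps_per_epoch * num_epochs                  # 3360, matching the progress bars\n",
- "\n",
- "# Evaluating and saving once per epoch yields the checkpoint-560,\n",
- "# checkpoint-1120, ..., checkpoint-3360 directories seen in the logs.\n",
- "checkpoints = [steps_per_epoch * (e + 1) for e in range(num_epochs)]\n",
- "print(total_steps, checkpoints)\n",
- "```\n"
- ]
- },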
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Current Directory:\n",
- "/home/inflaton/code/projects/courses/llm-finetuning/llama-factory\n",
- "07/05/2024 06:15:40 - WARNING - llamafactory.hparams.parser - We recommend enable `upcast_layernorm` in quantized training.\n",
- "07/05/2024 06:15:40 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-05 06:15:40,695 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/vocab.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-05 06:15:40,695 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/merges.txt\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-05 06:15:40,695 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/tokenizer.json\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-05 06:15:40,695 >> loading file added_tokens.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-05 06:15:40,695 >> loading file special_tokens_map.json from cache at None\n",
- "[INFO|tokenization_utils_base.py:2161] 2024-07-05 06:15:40,695 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/tokenizer_config.json\n",
- "[WARNING|logging.py:313] 2024-07-05 06:15:40,871 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
- "07/05/2024 06:15:40 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n",
- "07/05/2024 06:15:40 - INFO - llamafactory.data.template - Add <|im_start|> to stop words.\n",
- "07/05/2024 06:15:40 - INFO - llamafactory.data.loader - Loading dataset alpaca_mac.json...\n",
- "Converting format of dataset (num_proc=16): 100%|โ| 4528/4528 [00:00<00:00, 1717\n",
- "Running tokenizer on dataset (num_proc=16): 100%|โ| 4528/4528 [00:01<00:00, 2570\n",
- "input_ids:\n",
- "[151644, 872, 198, 5501, 14683, 279, 2701, 8453, 1467, 1119, 6364, 323, 3410, 1172, 279, 24531, 2213, 11, 4302, 770, 624, 35987, 102895, 99164, 100324, 100717, 100095, 99509, 1773, 151645, 198, 151644, 77091, 198, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "inputs:\n",
- "<|im_start|>user\n",
- "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n",
- "ๅ
จไป็็ไปๆญๆใ<|im_end|>\n",
- "<|im_start|>assistant\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "label_ids:\n",
- "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 17949, 358, 572, 2617, 553, 264, 38835, 44486, 13, 151645]\n",
- "labels:\n",
- "Because I was protected by a fox fairy.<|im_end|>\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 06:15:44,437 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2-0.5B-Instruct/snapshots/c291d6fce4804a1d39305f388dd32897d1f7acc4/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 06:15:44,438 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "07/05/2024 06:15:44 - INFO - llamafactory.model.model_utils.quantization - Quantizing model to 4 bit with bitsandbytes.\n",
- "๐ฆฅ Unsloth: Will patch your computer to enable 2x faster free finetuning.\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 06:15:45,429 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 06:15:45,430 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"unsloth/qwen2-0.5b-instruct-bnb-4bit\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "==((====))== Unsloth: Fast Qwen2 patching release 2024.7\n",
- " \\\\ /| GPU: NVIDIA GeForce RTX 4080 Laptop GPU. Max memory: 11.994 GB. Platform = Linux.\n",
- "O^O/ \\_/ \\ Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.\n",
- "\\ / Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]\n",
- " \"-____-\" Free Apache license: http://github.com/unslothai/unsloth\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 06:15:46,517 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 06:15:46,517 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"unsloth/qwen2-0.5b-instruct-bnb-4bit\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 06:15:47,071 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 06:15:47,071 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"unsloth/qwen2-0.5b-instruct-bnb-4bit\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|modeling_utils.py:3556] 2024-07-05 06:15:47,115 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/model.safetensors\n",
- "[INFO|modeling_utils.py:1531] 2024-07-05 06:15:48,951 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n",
- "[INFO|configuration_utils.py:1000] 2024-07-05 06:15:48,969 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645\n",
- "}\n",
- "\n",
- "[INFO|modeling_utils.py:4364] 2024-07-05 06:16:14,443 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n",
- "\n",
- "[INFO|modeling_utils.py:4372] 2024-07-05 06:16:14,443 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at unsloth/qwen2-0.5b-instruct-bnb-4bit.\n",
- "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n",
- "[INFO|configuration_utils.py:955] 2024-07-05 06:16:14,971 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/generation_config.json\n",
- "[INFO|configuration_utils.py:1000] 2024-07-05 06:16:14,971 >> Generate config GenerationConfig {\n",
- " \"bos_token_id\": 151643,\n",
- " \"do_sample\": true,\n",
- " \"eos_token_id\": [\n",
- " 151645,\n",
- " 151643\n",
- " ],\n",
- " \"pad_token_id\": 151643,\n",
- " \"repetition_penalty\": 1.1,\n",
- " \"temperature\": 0.7,\n",
- " \"top_k\": 20,\n",
- " \"top_p\": 0.8\n",
- "}\n",
- "\n",
- "07/05/2024 06:16:18 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n",
- "07/05/2024 06:16:18 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n",
- "07/05/2024 06:16:18 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n",
- "07/05/2024 06:16:18 - INFO - llamafactory.model.model_utils.misc - Found linear modules: gate_proj,q_proj,k_proj,up_proj,down_proj,o_proj,v_proj\n",
- "[WARNING|logging.py:328] 2024-07-05 06:16:19,091 >> Unsloth 2024.7 patched 24 layers with 0 QKV layers, 24 O layers and 24 MLP layers.\n",
- "07/05/2024 06:16:19 - INFO - llamafactory.model.loader - trainable params: 4,399,104 || all params: 634,566,528 || trainable%: 0.6932\n",
- "[INFO|trainer.py:642] 2024-07-05 06:16:19,940 >> Using auto half precision backend\n",
- "07/05/2024 06:16:19 - WARNING - llamafactory.train.callbacks - Previous trainer log in this folder will be deleted.\n",
- "07/05/2024 06:16:20 - INFO - llamafactory.train.trainer_utils - Using LoRA+ optimizer with loraplus lr ratio 16.00.\n",
- "[WARNING|:223] 2024-07-05 06:16:20,129 >> ==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1\n",
- " \\\\ /| Num examples = 4,482 | Num Epochs = 6\n",
- "O^O/ \\_/ \\ Batch size per device = 1 | Gradient Accumulation steps = 8\n",
- "\\ / Total batch size = 8 | Total steps = 3,360\n",
- " \"-____-\" Number of trainable parameters = 4,399,104\n",
- "[INFO|integration_utils.py:750] 2024-07-05 06:16:20,818 >> Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33minflaton-sg\u001b[0m (\u001b[33minflaton-ai\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.17.4\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m/home/inflaton/code/projects/courses/llm-finetuning/llama-factory/wandb/run-20240705_061623-3amepb0m\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mqwen2_0.5b_lora_sft\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run at \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/3amepb0m\u001b[0m\n",
- "{'loss': 2.6325, 'grad_norm': 2.6052567958831787, 'learning_rate': 2.9761904761904763e-06, 'epoch': 0.02}\n",
- "{'loss': 2.6514, 'grad_norm': 2.433773994445801, 'learning_rate': 5.9523809523809525e-06, 'epoch': 0.04}\n",
- "{'loss': 2.474, 'grad_norm': 2.1471617221832275, 'learning_rate': 8.92857142857143e-06, 'epoch': 0.05}\n",
- "{'loss': 2.3031, 'grad_norm': 4.300695419311523, 'learning_rate': 1.1904761904761905e-05, 'epoch': 0.07}\n",
- "{'loss': 2.4774, 'grad_norm': 1.8105831146240234, 'learning_rate': 1.4880952380952381e-05, 'epoch': 0.09}\n",
- "{'loss': 2.2519, 'grad_norm': 2.077115297317505, 'learning_rate': 1.785714285714286e-05, 'epoch': 0.11}\n",
- "{'loss': 2.4309, 'grad_norm': 1.9538270235061646, 'learning_rate': 2.0833333333333336e-05, 'epoch': 0.12}\n",
- "{'loss': 2.22, 'grad_norm': 2.1473119258880615, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.14}\n",
- "{'loss': 2.3228, 'grad_norm': 2.819317579269409, 'learning_rate': 2.6785714285714288e-05, 'epoch': 0.16}\n",
- "{'loss': 2.238, 'grad_norm': 1.9084508419036865, 'learning_rate': 2.9761904761904762e-05, 'epoch': 0.18}\n",
- "{'loss': 2.2707, 'grad_norm': 2.1343274116516113, 'learning_rate': 3.273809523809524e-05, 'epoch': 0.2}\n",
- "{'loss': 2.286, 'grad_norm': 2.273739814758301, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.21}\n",
- "{'loss': 2.1805, 'grad_norm': 2.505805253982544, 'learning_rate': 3.8690476190476195e-05, 'epoch': 0.23}\n",
- "{'loss': 2.2527, 'grad_norm': 2.4992618560791016, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.25}\n",
- "{'loss': 2.1387, 'grad_norm': 1.9521129131317139, 'learning_rate': 4.464285714285715e-05, 'epoch': 0.27}\n",
- "{'loss': 2.1733, 'grad_norm': 1.7223074436187744, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.29}\n",
- "{'loss': 2.2774, 'grad_norm': 1.8748223781585693, 'learning_rate': 5.05952380952381e-05, 'epoch': 0.3}\n",
- "{'loss': 2.0726, 'grad_norm': 2.039461135864258, 'learning_rate': 5.3571428571428575e-05, 'epoch': 0.32}\n",
- "{'loss': 2.1471, 'grad_norm': 2.512571096420288, 'learning_rate': 5.6547619047619046e-05, 'epoch': 0.34}\n",
- "{'loss': 2.3088, 'grad_norm': 2.0730302333831787, 'learning_rate': 5.9523809523809524e-05, 'epoch': 0.36}\n",
- "{'loss': 2.2315, 'grad_norm': 1.9101688861846924, 'learning_rate': 6.25e-05, 'epoch': 0.37}\n",
- "{'loss': 2.1767, 'grad_norm': 2.6846179962158203, 'learning_rate': 6.547619047619048e-05, 'epoch': 0.39}\n",
- "{'loss': 2.1396, 'grad_norm': 2.3576760292053223, 'learning_rate': 6.845238095238096e-05, 'epoch': 0.41}\n",
- "{'loss': 2.3496, 'grad_norm': 2.5166685581207275, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.43}\n",
- "{'loss': 2.1899, 'grad_norm': 2.326274871826172, 'learning_rate': 7.440476190476191e-05, 'epoch': 0.45}\n",
- "{'loss': 2.1658, 'grad_norm': 2.342203140258789, 'learning_rate': 7.738095238095239e-05, 'epoch': 0.46}\n",
- "{'loss': 2.1561, 'grad_norm': 2.895669937133789, 'learning_rate': 8.035714285714287e-05, 'epoch': 0.48}\n",
- "{'loss': 2.264, 'grad_norm': 3.2078170776367188, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.5}\n",
- "{'loss': 2.088, 'grad_norm': 2.282803773880005, 'learning_rate': 8.630952380952382e-05, 'epoch': 0.52}\n",
- "{'loss': 2.1821, 'grad_norm': 2.5930910110473633, 'learning_rate': 8.92857142857143e-05, 'epoch': 0.54}\n",
- "{'loss': 2.2382, 'grad_norm': 2.7073450088500977, 'learning_rate': 9.226190476190478e-05, 'epoch': 0.55}\n",
- "{'loss': 2.0117, 'grad_norm': 3.457638740539551, 'learning_rate': 9.523809523809524e-05, 'epoch': 0.57}\n",
- "{'loss': 2.0526, 'grad_norm': 3.453278064727783, 'learning_rate': 9.821428571428572e-05, 'epoch': 0.59}\n",
- "{'loss': 2.1403, 'grad_norm': 2.7960667610168457, 'learning_rate': 9.999956828659095e-05, 'epoch': 0.61}\n",
- "{'loss': 2.161, 'grad_norm': 3.307030439376831, 'learning_rate': 9.999471159635539e-05, 'epoch': 0.62}\n",
- "{'loss': 2.0478, 'grad_norm': 2.788396120071411, 'learning_rate': 9.998445910004082e-05, 'epoch': 0.64}\n",
- "{'loss': 2.3267, 'grad_norm': 4.489534378051758, 'learning_rate': 9.996881190417393e-05, 'epoch': 0.66}\n",
- "{'loss': 2.2085, 'grad_norm': 2.93642520904541, 'learning_rate': 9.994777169751806e-05, 'epoch': 0.68}\n",
- "{'loss': 1.8982, 'grad_norm': 2.470207929611206, 'learning_rate': 9.992134075089084e-05, 'epoch': 0.7}\n",
- "{'loss': 2.1388, 'grad_norm': 2.992520809173584, 'learning_rate': 9.988952191691925e-05, 'epoch': 0.71}\n",
- "{'loss': 2.1675, 'grad_norm': 2.986842155456543, 'learning_rate': 9.985231862973168e-05, 'epoch': 0.73}\n",
- "{'loss': 2.1914, 'grad_norm': 2.8504011631011963, 'learning_rate': 9.980973490458728e-05, 'epoch': 0.75}\n",
- "{'loss': 2.1588, 'grad_norm': 3.4979565143585205, 'learning_rate': 9.976177533744261e-05, 'epoch': 0.77}\n",
- "{'loss': 2.0952, 'grad_norm': 3.6922664642333984, 'learning_rate': 9.97084451044556e-05, 'epoch': 0.79}\n",
- "{'loss': 2.0288, 'grad_norm': 2.895118236541748, 'learning_rate': 9.964974996142698e-05, 'epoch': 0.8}\n",
- "{'loss': 2.1275, 'grad_norm': 3.1226203441619873, 'learning_rate': 9.958569624317893e-05, 'epoch': 0.82}\n",
- "{'loss': 2.1303, 'grad_norm': 4.210818767547607, 'learning_rate': 9.951629086287151e-05, 'epoch': 0.84}\n",
- "{'loss': 2.1294, 'grad_norm': 2.9749433994293213, 'learning_rate': 9.944154131125642e-05, 'epoch': 0.86}\n",
- "{'loss': 2.1612, 'grad_norm': 2.9232656955718994, 'learning_rate': 9.936145565586871e-05, 'epoch': 0.87}\n",
- "{'loss': 2.3294, 'grad_norm': 2.8355772495269775, 'learning_rate': 9.927604254015585e-05, 'epoch': 0.89}\n",
- "{'loss': 2.274, 'grad_norm': 3.1120338439941406, 'learning_rate': 9.918531118254507e-05, 'epoch': 0.91}\n",
- "{'loss': 2.1442, 'grad_norm': 4.310208797454834, 'learning_rate': 9.90892713754483e-05, 'epoch': 0.93}\n",
- "{'loss': 2.1595, 'grad_norm': 3.8621461391448975, 'learning_rate': 9.898793348420536e-05, 'epoch': 0.95}\n",
- "{'loss': 2.1399, 'grad_norm': 2.8605706691741943, 'learning_rate': 9.888130844596524e-05, 'epoch': 0.96}\n",
- "{'loss': 2.1673, 'grad_norm': 3.161895275115967, 'learning_rate': 9.876940776850569e-05, 'epoch': 0.98}\n",
- "{'loss': 2.1621, 'grad_norm': 3.304511785507202, 'learning_rate': 9.865224352899119e-05, 'epoch': 1.0}\n",
- " 17%|โโโโโโโ | 560/3360 [15:54<1:17:19, 1.66s/it][INFO|trainer.py:3788] 2024-07-05 06:32:21,677 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 06:32:21,677 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 06:32:21,677 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 24.61it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:02, 19.42it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:02, 18.32it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:00<00:02, 16.73it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:00<00:01, 16.68it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 16.68it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 16.66it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:01<00:01, 16.98it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 17.04it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:01<00:01, 17.26it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:01<00:01, 17.35it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:01, 16.65it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:01, 16.79it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 16.54it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 15.94it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:02<00:00, 16.17it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:02<00:00, 16.44it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:02<00:00, 16.57it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:02<00:00, 16.44it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:02<00:00, 16.64it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.109107255935669, 'eval_runtime': 2.798, 'eval_samples_per_second': 16.44, 'eval_steps_per_second': 16.44, 'epoch': 1.0}\n",
- " 17%|โโโโโโโ | 560/3360 [15:57<1:17:19, 1.66s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 16.83it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-05 06:32:24,477 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-560\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 06:32:25,696 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 06:32:25,697 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "{'loss': 1.6151, 'grad_norm': 3.5378007888793945, 'learning_rate': 9.852982837266955e-05, 'epoch': 1.02}\n",
- "{'loss': 1.3799, 'grad_norm': 3.1997132301330566, 'learning_rate': 9.840217551150706e-05, 'epoch': 1.04}\n",
- "{'loss': 1.6132, 'grad_norm': 2.48860502243042, 'learning_rate': 9.826929872276255e-05, 'epoch': 1.05}\n",
- "{'loss': 1.4984, 'grad_norm': 3.7188329696655273, 'learning_rate': 9.81312123475006e-05, 'epoch': 1.07}\n",
- "{'loss': 1.4967, 'grad_norm': 3.4040935039520264, 'learning_rate': 9.798793128904356e-05, 'epoch': 1.09}\n",
- "{'loss': 1.5688, 'grad_norm': 3.603771924972534, 'learning_rate': 9.78394710113631e-05, 'epoch': 1.11}\n",
- "{'loss': 1.4902, 'grad_norm': 3.248730421066284, 'learning_rate': 9.768584753741134e-05, 'epoch': 1.12}\n",
- "{'loss': 1.4788, 'grad_norm': 4.081541538238525, 'learning_rate': 9.752707744739145e-05, 'epoch': 1.14}\n",
- "{'loss': 1.5933, 'grad_norm': 3.348815441131592, 'learning_rate': 9.736317787696816e-05, 'epoch': 1.16}\n",
- "{'loss': 1.4597, 'grad_norm': 5.059058666229248, 'learning_rate': 9.719416651541839e-05, 'epoch': 1.18}\n",
- "{'loss': 1.5088, 'grad_norm': 2.929900646209717, 'learning_rate': 9.702006160372209e-05, 'epoch': 1.2}\n",
- "{'loss': 1.5122, 'grad_norm': 3.9229655265808105, 'learning_rate': 9.684088193259355e-05, 'epoch': 1.21}\n",
- "{'loss': 1.4982, 'grad_norm': 4.456009864807129, 'learning_rate': 9.665664684045333e-05, 'epoch': 1.23}\n",
- "{'loss': 1.5631, 'grad_norm': 6.255136966705322, 'learning_rate': 9.646737621134112e-05, 'epoch': 1.25}\n",
- "{'loss': 1.5067, 'grad_norm': 4.147162914276123, 'learning_rate': 9.627309047276974e-05, 'epoch': 1.27}\n",
- "{'loss': 1.6788, 'grad_norm': 4.083860874176025, 'learning_rate': 9.607381059352038e-05, 'epoch': 1.29}\n",
- "{'loss': 1.6006, 'grad_norm': 3.7379791736602783, 'learning_rate': 9.586955808137958e-05, 'epoch': 1.3}\n",
- "{'loss': 1.6328, 'grad_norm': 3.6500179767608643, 'learning_rate': 9.566035498081784e-05, 'epoch': 1.32}\n",
- "{'loss': 1.6155, 'grad_norm': 3.455841302871704, 'learning_rate': 9.544622387061055e-05, 'epoch': 1.34}\n",
- "{'loss': 1.3868, 'grad_norm': 3.636683702468872, 'learning_rate': 9.522718786140097e-05, 'epoch': 1.36}\n",
- "{'loss': 1.5776, 'grad_norm': 4.494875431060791, 'learning_rate': 9.500327059320606e-05, 'epoch': 1.37}\n",
- "{'loss': 1.4877, 'grad_norm': 4.710891246795654, 'learning_rate': 9.477449623286505e-05, 'epoch': 1.39}\n",
- "{'loss': 1.401, 'grad_norm': 3.5016818046569824, 'learning_rate': 9.454088947143116e-05, 'epoch': 1.41}\n",
- "{'loss': 1.628, 'grad_norm': 4.40405797958374, 'learning_rate': 9.430247552150673e-05, 'epoch': 1.43}\n",
- "{'loss': 1.4999, 'grad_norm': 3.74572491645813, 'learning_rate': 9.405928011452211e-05, 'epoch': 1.45}\n",
- "{'loss': 1.5602, 'grad_norm': 4.144255638122559, 'learning_rate': 9.381132949795861e-05, 'epoch': 1.46}\n",
- "{'loss': 1.6872, 'grad_norm': 4.109062671661377, 'learning_rate': 9.35586504325155e-05, 'epoch': 1.48}\n",
- "{'loss': 1.5494, 'grad_norm': 7.194815635681152, 'learning_rate': 9.330127018922194e-05, 'epoch': 1.5}\n",
- "{'loss': 1.4354, 'grad_norm': 3.779526948928833, 'learning_rate': 9.303921654649362e-05, 'epoch': 1.52}\n",
- "{'loss': 1.593, 'grad_norm': 3.863893508911133, 'learning_rate': 9.277251778713474e-05, 'epoch': 1.54}\n",
- "{'loss': 1.5795, 'grad_norm': 3.684547185897827, 'learning_rate': 9.250120269528546e-05, 'epoch': 1.55}\n",
- "{'loss': 1.5245, 'grad_norm': 3.9775428771972656, 'learning_rate': 9.22253005533154e-05, 'epoch': 1.57}\n",
- "{'loss': 1.631, 'grad_norm': 4.817204475402832, 'learning_rate': 9.194484113866313e-05, 'epoch': 1.59}\n",
- "{'loss': 1.658, 'grad_norm': 3.928107738494873, 'learning_rate': 9.165985472062246e-05, 'epoch': 1.61}\n",
- "{'loss': 1.464, 'grad_norm': 4.099756240844727, 'learning_rate': 9.137037205707552e-05, 'epoch': 1.62}\n",
- "{'loss': 1.5206, 'grad_norm': 3.9024410247802734, 'learning_rate': 9.107642439117321e-05, 'epoch': 1.64}\n",
- "{'loss': 1.6011, 'grad_norm': 3.7552289962768555, 'learning_rate': 9.077804344796302e-05, 'epoch': 1.66}\n",
- "{'loss': 1.4891, 'grad_norm': 3.713045835494995, 'learning_rate': 9.04752614309652e-05, 'epoch': 1.68}\n",
- "{'loss': 1.5139, 'grad_norm': 3.589451313018799, 'learning_rate': 9.01681110186971e-05, 'epoch': 1.7}\n",
- "{'loss': 1.5901, 'grad_norm': 3.9955010414123535, 'learning_rate': 8.985662536114613e-05, 'epoch': 1.71}\n",
- "{'loss': 1.5646, 'grad_norm': 3.6160426139831543, 'learning_rate': 8.954083807619208e-05, 'epoch': 1.73}\n",
- "{'loss': 1.6884, 'grad_norm': 4.0372796058654785, 'learning_rate': 8.922078324597879e-05, 'epoch': 1.75}\n",
- "{'loss': 1.6813, 'grad_norm': 4.466279983520508, 'learning_rate': 8.889649541323574e-05, 'epoch': 1.77}\n",
- "{'loss': 1.5947, 'grad_norm': 5.11010217666626, 'learning_rate': 8.856800957755e-05, 'epoch': 1.78}\n",
- "{'loss': 1.6637, 'grad_norm': 5.363622188568115, 'learning_rate': 8.823536119158864e-05, 'epoch': 1.8}\n",
- "{'loss': 1.5541, 'grad_norm': 4.0909223556518555, 'learning_rate': 8.789858615727265e-05, 'epoch': 1.82}\n",
- "{'loss': 1.523, 'grad_norm': 3.796602249145508, 'learning_rate': 8.755772082190194e-05, 'epoch': 1.84}\n",
- "{'loss': 1.6437, 'grad_norm': 4.511483669281006, 'learning_rate': 8.721280197423258e-05, 'epoch': 1.86}\n",
- "{'loss': 1.4852, 'grad_norm': 4.5722246170043945, 'learning_rate': 8.68638668405062e-05, 'epoch': 1.87}\n",
- "{'loss': 1.5986, 'grad_norm': 4.731987953186035, 'learning_rate': 8.651095308043232e-05, 'epoch': 1.89}\n",
- "{'loss': 1.7502, 'grad_norm': 6.07273530960083, 'learning_rate': 8.61540987831238e-05, 'epoch': 1.91}\n",
- "{'loss': 1.6979, 'grad_norm': 5.418001651763916, 'learning_rate': 8.579334246298593e-05, 'epoch': 1.93}\n",
- "{'loss': 1.5625, 'grad_norm': 4.6554341316223145, 'learning_rate': 8.542872305555978e-05, 'epoch': 1.95}\n",
- "{'loss': 1.4509, 'grad_norm': 3.8252899646759033, 'learning_rate': 8.50602799133199e-05, 'epoch': 1.96}\n",
- "{'loss': 1.5915, 'grad_norm': 4.251583099365234, 'learning_rate': 8.468805280142709e-05, 'epoch': 1.98}\n",
- "{'loss': 1.6074, 'grad_norm': 4.587167739868164, 'learning_rate': 8.43120818934367e-05, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโโ | 1120/3360 [31:19<1:00:24, 1.62s/it][INFO|trainer.py:3788] 2024-07-05 06:47:46,319 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 06:47:46,319 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 06:47:46,319 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 21.50it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:02, 18.37it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:02, 17.59it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:00<00:02, 17.41it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 17.36it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 17.38it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:00<00:01, 17.26it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:01<00:01, 16.55it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:01<00:01, 16.44it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:01<00:01, 16.77it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 17.01it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 16.91it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:01, 17.18it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 16.93it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 17.12it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 17.01it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:02<00:00, 17.13it/s]\u001b[A\n",
- " 83%|โโ๏ฟฝ๏ฟฝ๏ฟฝโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:02<00:00, 16.83it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 16.12it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 15.90it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 15.18it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.179692029953003, 'eval_runtime': 2.8332, 'eval_samples_per_second': 16.236, 'eval_steps_per_second': 16.236, 'epoch': 2.0}\n",
- " 33%|โโโโโโโโโโโโโ | 1120/3360 [31:22<1:00:24, 1.62s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 15.18it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-05 06:47:49,154 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-1120\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 06:47:50,281 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 06:47:50,282 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "{'loss': 0.9111, 'grad_norm': 5.008914470672607, 'learning_rate': 8.393240776696274e-05, 'epoch': 2.02}\n",
- "{'loss': 0.863, 'grad_norm': 6.299067974090576, 'learning_rate': 8.354907139929851e-05, 'epoch': 2.03}\n",
- "{'loss': 0.8515, 'grad_norm': 4.728297233581543, 'learning_rate': 8.316211416299397e-05, 'epoch': 2.05}\n",
- "{'loss': 0.8316, 'grad_norm': 4.379367351531982, 'learning_rate': 8.27715778213905e-05, 'epoch': 2.07}\n",
- "{'loss': 0.6608, 'grad_norm': 4.311402320861816, 'learning_rate': 8.237750452411353e-05, 'epoch': 2.09}\n",
- "{'loss': 0.8713, 'grad_norm': 3.783642530441284, 'learning_rate': 8.197993680252334e-05, 'epoch': 2.11}\n",
- "{'loss': 1.0158, 'grad_norm': 4.141658782958984, 'learning_rate': 8.157891756512488e-05, 'epoch': 2.12}\n",
- "{'loss': 0.9155, 'grad_norm': 4.355412483215332, 'learning_rate': 8.117449009293668e-05, 'epoch': 2.14}\n",
- "{'loss': 0.8407, 'grad_norm': 5.703305721282959, 'learning_rate': 8.076669803481965e-05, 'epoch': 2.16}\n",
- "{'loss': 0.8494, 'grad_norm': 5.374706745147705, 'learning_rate': 8.035558540276618e-05, 'epoch': 2.18}\n",
- "{'loss': 0.8743, 'grad_norm': 4.037242889404297, 'learning_rate': 7.994119656715002e-05, 'epoch': 2.2}\n",
- "{'loss': 0.9841, 'grad_norm': 4.615417957305908, 'learning_rate': 7.952357625193749e-05, 'epoch': 2.21}\n",
- "{'loss': 0.9296, 'grad_norm': 4.376211643218994, 'learning_rate': 7.91027695298606e-05, 'epoch': 2.23}\n",
- "{'loss': 0.9142, 'grad_norm': 4.084548473358154, 'learning_rate': 7.86788218175523e-05, 'epoch': 2.25}\n",
- "{'loss': 0.8517, 'grad_norm': 4.527939796447754, 'learning_rate': 7.8251778870645e-05, 'epoch': 2.27}\n",
- "{'loss': 0.9113, 'grad_norm': 5.170512676239014, 'learning_rate': 7.782168677883206e-05, 'epoch': 2.28}\n",
- "{'loss': 0.9332, 'grad_norm': 4.342284202575684, 'learning_rate': 7.738859196089358e-05, 'epoch': 2.3}\n",
- "{'loss': 0.9759, 'grad_norm': 4.931323051452637, 'learning_rate': 7.695254115968648e-05, 'epoch': 2.32}\n",
- "{'loss': 1.0079, 'grad_norm': 3.684819459915161, 'learning_rate': 7.651358143709972e-05, 'epoch': 2.34}\n",
- "{'loss': 0.9958, 'grad_norm': 5.162328720092773, 'learning_rate': 7.60717601689749e-05, 'epoch': 2.36}\n",
- "{'loss': 0.9528, 'grad_norm': 4.386671543121338, 'learning_rate': 7.562712503999327e-05, 'epoch': 2.37}\n",
- "{'loss': 1.1468, 'grad_norm': 5.785244464874268, 'learning_rate': 7.517972403852905e-05, 'epoch': 2.39}\n",
- "{'loss': 0.9291, 'grad_norm': 4.308371543884277, 'learning_rate': 7.472960545147038e-05, 'epoch': 2.41}\n",
- "{'loss': 0.8408, 'grad_norm': 5.942112922668457, 'learning_rate': 7.427681785900761e-05, 'epoch': 2.43}\n",
- "{'loss': 0.9693, 'grad_norm': 4.682136535644531, 'learning_rate': 7.382141012939034e-05, 'epoch': 2.45}\n",
- "{'loss': 0.8726, 'grad_norm': 4.883449077606201, 'learning_rate': 7.33634314136531e-05, 'epoch': 2.46}\n",
- "{'loss': 0.9426, 'grad_norm': 4.833103656768799, 'learning_rate': 7.290293114031061e-05, 'epoch': 2.48}\n",
- "{'loss': 1.0333, 'grad_norm': 4.8503289222717285, 'learning_rate': 7.243995901002312e-05, 'epoch': 2.5}\n",
- "{'loss': 0.9984, 'grad_norm': 4.3091230392456055, 'learning_rate': 7.197456499023225e-05, 'epoch': 2.52}\n",
- "{'loss': 1.0019, 'grad_norm': 4.726260662078857, 'learning_rate': 7.150679930976825e-05, 'epoch': 2.53}\n",
- "{'loss': 0.9594, 'grad_norm': 3.850511312484741, 'learning_rate': 7.103671245342887e-05, 'epoch': 2.55}\n",
- "{'loss': 0.8701, 'grad_norm': 5.5012030601501465, 'learning_rate': 7.056435515653059e-05, 'epoch': 2.57}\n",
- "{'loss': 1.0956, 'grad_norm': 5.610720157623291, 'learning_rate': 7.008977839943299e-05, 'epoch': 2.59}\n",
- "{'loss': 0.9175, 'grad_norm': 3.8002779483795166, 'learning_rate': 6.961303340203653e-05, 'epoch': 2.61}\n",
- "{'loss': 1.0243, 'grad_norm': 5.210932731628418, 'learning_rate': 6.91341716182545e-05, 'epoch': 2.62}\n",
- "{'loss': 0.9902, 'grad_norm': 3.9311327934265137, 'learning_rate': 6.86532447304597e-05, 'epoch': 2.64}\n",
- "{'loss': 0.9589, 'grad_norm': 4.984393119812012, 'learning_rate': 6.817030464390656e-05, 'epoch': 2.66}\n",
- "{'loss': 0.9985, 'grad_norm': 4.881758689880371, 'learning_rate': 6.768540348112907e-05, 'epoch': 2.68}\n",
- "{'loss': 0.8961, 'grad_norm': 6.465915203094482, 'learning_rate': 6.719859357631535e-05, 'epoch': 2.7}\n",
- "{'loss': 0.8434, 'grad_norm': 5.6094183921813965, 'learning_rate': 6.670992746965938e-05, 'epoch': 2.71}\n",
- "{'loss': 1.0485, 'grad_norm': 5.219779968261719, 'learning_rate': 6.621945790169036e-05, 'epoch': 2.73}\n",
- "{'loss': 1.0165, 'grad_norm': 5.263071060180664, 'learning_rate': 6.572723780758069e-05, 'epoch': 2.75}\n",
- "{'loss': 0.9104, 'grad_norm': 4.919801235198975, 'learning_rate': 6.523332031143272e-05, 'epoch': 2.77}\n",
- "{'loss': 0.9633, 'grad_norm': 4.69899320602417, 'learning_rate': 6.473775872054521e-05, 'epoch': 2.78}\n",
- "{'loss': 0.8483, 'grad_norm': 4.0923285484313965, 'learning_rate': 6.424060651966007e-05, 'epoch': 2.8}\n",
- "{'loss': 0.8888, 'grad_norm': 5.461803436279297, 'learning_rate': 6.374191736518974e-05, 'epoch': 2.82}\n",
- "{'loss': 0.9598, 'grad_norm': 4.758564472198486, 'learning_rate': 6.324174507942637e-05, 'epoch': 2.84}\n",
- "{'loss': 0.9436, 'grad_norm': 6.395792007446289, 'learning_rate': 6.274014364473274e-05, 'epoch': 2.86}\n",
- "{'loss': 1.1634, 'grad_norm': 6.077510356903076, 'learning_rate': 6.22371671977162e-05, 'epoch': 2.87}\n",
- "{'loss': 1.0049, 'grad_norm': 5.1858720779418945, 'learning_rate': 6.173287002338577e-05, 'epoch': 2.89}\n",
- "{'loss': 0.9795, 'grad_norm': 6.103806972503662, 'learning_rate': 6.122730654929334e-05, 'epoch': 2.91}\n",
- "{'loss': 0.9422, 'grad_norm': 5.469768524169922, 'learning_rate': 6.072053133965938e-05, 'epoch': 2.93}\n",
- "{'loss': 1.0349, 'grad_norm': 4.436359405517578, 'learning_rate': 6.021259908948402e-05, 'epoch': 2.95}\n",
- "{'loss': 1.1161, 'grad_norm': 5.872861862182617, 'learning_rate': 5.970356461864391e-05, 'epoch': 2.96}\n",
- "{'loss': 0.9069, 'grad_norm': 5.360676288604736, 'learning_rate': 5.919348286597569e-05, 'epoch': 2.98}\n",
- "{'loss': 1.0593, 'grad_norm': 4.815310001373291, 'learning_rate': 5.868240888334653e-05, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโ | 1680/3360 [46:46<45:31, 1.63s/it][INFO|trainer.py:3788] 2024-07-05 07:03:13,485 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 07:03:13,485 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 07:03:13,485 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:02, 20.21it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:02, 17.41it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:02, 17.26it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:00<00:02, 16.97it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:02, 16.11it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 16.37it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:00<00:01, 16.50it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:01<00:01, 15.76it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:01<00:01, 15.94it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:01<00:01, 16.48it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 16.40it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 16.08it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:01, 16.55it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:01, 15.12it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:02<00:00, 14.44it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:02<00:00, 14.78it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:02<00:00, 15.38it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:02<00:00, 15.94it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 16.33it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 16.56it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 16.62it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.4282326698303223, 'eval_runtime': 2.8929, 'eval_samples_per_second': 15.901, 'eval_steps_per_second': 15.901, 'epoch': 3.0}\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโ | 1680/3360 [46:49<45:31, 1.63s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 16.86it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-05 07:03:16,380 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-1680\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 07:03:17,790 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 07:03:17,790 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "{'loss': 0.6987, 'grad_norm': 4.28726863861084, 'learning_rate': 5.8170397829712485e-05, 'epoch': 3.02}\n",
- "{'loss': 0.3462, 'grad_norm': 5.342904567718506, 'learning_rate': 5.765750496516547e-05, 'epoch': 3.03}\n",
- "{'loss': 0.4899, 'grad_norm': 3.8532354831695557, 'learning_rate': 5.714378564496901e-05, 'epoch': 3.05}\n",
- "{'loss': 0.4609, 'grad_norm': 4.3072590827941895, 'learning_rate': 5.6629295313583974e-05, 'epoch': 3.07}\n",
- "{'loss': 0.4106, 'grad_norm': 4.2518463134765625, 'learning_rate': 5.611408949868457e-05, 'epoch': 3.09}\n",
- "{'loss': 0.5169, 'grad_norm': 4.579401016235352, 'learning_rate': 5.559822380516539e-05, 'epoch': 3.11}\n",
- "{'loss': 0.4794, 'grad_norm': 3.6858370304107666, 'learning_rate': 5.5081753909140096e-05, 'epoch': 3.12}\n",
- "{'loss': 0.5473, 'grad_norm': 8.67149543762207, 'learning_rate': 5.456473555193242e-05, 'epoch': 3.14}\n",
- "{'loss': 0.4638, 'grad_norm': 6.095928192138672, 'learning_rate': 5.404722453406017e-05, 'epoch': 3.16}\n",
- "{'loss': 0.4697, 'grad_norm': 6.712044715881348, 'learning_rate': 5.3529276709212816e-05, 'epoch': 3.18}\n",
- "{'loss': 0.4869, 'grad_norm': 4.1765336990356445, 'learning_rate': 5.30109479782233e-05, 'epoch': 3.2}\n",
- "{'loss': 0.4821, 'grad_norm': 4.068556308746338, 'learning_rate': 5.249229428303486e-05, 'epoch': 3.21}\n",
- "{'loss': 0.6011, 'grad_norm': 3.6553525924682617, 'learning_rate': 5.197337160066331e-05, 'epoch': 3.23}\n",
- "{'loss': 0.4558, 'grad_norm': 4.888422012329102, 'learning_rate': 5.145423593715557e-05, 'epoch': 3.25}\n",
- "{'loss': 0.5203, 'grad_norm': 4.138525009155273, 'learning_rate': 5.0934943321545115e-05, 'epoch': 3.27}\n",
- "{'loss': 0.3826, 'grad_norm': 4.2213358879089355, 'learning_rate': 5.041554979980486e-05, 'epoch': 3.28}\n",
- "{'loss': 0.5895, 'grad_norm': 4.9374260902404785, 'learning_rate': 4.9896111428798254e-05, 'epoch': 3.3}\n",
- "{'loss': 0.5609, 'grad_norm': 4.482494831085205, 'learning_rate': 4.9376684270229254e-05, 'epoch': 3.32}\n",
- "{'loss': 0.5478, 'grad_norm': 3.9575753211975098, 'learning_rate': 4.8857324384591653e-05, 'epoch': 3.34}\n",
- "{'loss': 0.4865, 'grad_norm': 5.01925802230835, 'learning_rate': 4.8338087825118675e-05, 'epoch': 3.36}\n",
- "{'loss': 0.5365, 'grad_norm': 4.109598636627197, 'learning_rate': 4.781903063173321e-05, 'epoch': 3.37}\n",
- "{'loss': 0.4814, 'grad_norm': 3.7702512741088867, 'learning_rate': 4.730020882499964e-05, 'epoch': 3.39}\n",
- "{'loss': 0.5355, 'grad_norm': 6.243114948272705, 'learning_rate': 4.678167840007767e-05, 'epoch': 3.41}\n",
- "{'loss': 0.5361, 'grad_norm': 4.488025188446045, 'learning_rate': 4.626349532067879e-05, 'epoch': 3.43}\n",
- "{'loss': 0.5952, 'grad_norm': 4.389721870422363, 'learning_rate': 4.574571551302647e-05, 'epoch': 3.44}\n",
- "{'loss': 0.6049, 'grad_norm': 4.847557067871094, 'learning_rate': 4.522839485981994e-05, 'epoch': 3.46}\n",
- "{'loss': 0.5697, 'grad_norm': 3.9925057888031006, 'learning_rate': 4.471158919420312e-05, 'epoch': 3.48}\n",
- "{'loss': 0.5018, 'grad_norm': 5.327306747436523, 'learning_rate': 4.4195354293738484e-05, 'epoch': 3.5}\n",
- "{'loss': 0.4745, 'grad_norm': 5.380455493927002, 'learning_rate': 4.367974587438733e-05, 'epoch': 3.52}\n",
- "{'loss': 0.5421, 'grad_norm': 3.978426694869995, 'learning_rate': 4.316481958449634e-05, 'epoch': 3.53}\n",
- "{'loss': 0.5091, 'grad_norm': 8.685088157653809, 'learning_rate': 4.2650630998791615e-05, 'epoch': 3.55}\n",
- "{'loss': 0.6102, 'grad_norm': 4.471510887145996, 'learning_rate': 4.213723561238074e-05, 'epoch': 3.57}\n",
- "{'loss': 0.4623, 'grad_norm': 4.236584663391113, 'learning_rate': 4.162468883476319e-05, 'epoch': 3.59}\n",
- "{'loss': 0.5203, 'grad_norm': 5.698358535766602, 'learning_rate': 4.111304598385018e-05, 'epoch': 3.61}\n",
- "{'loss': 0.5314, 'grad_norm': 5.975699424743652, 'learning_rate': 4.060236227999441e-05, 'epoch': 3.62}\n",
- "{'loss': 0.484, 'grad_norm': 5.300996780395508, 'learning_rate': 4.0092692840030134e-05, 'epoch': 3.64}\n",
- "{'loss': 0.4564, 'grad_norm': 7.857934474945068, 'learning_rate': 3.9584092671324606e-05, 'epoch': 3.66}\n",
- "{'loss': 0.5715, 'grad_norm': 3.796581268310547, 'learning_rate': 3.907661666584131e-05, 'epoch': 3.68}\n",
- "{'loss': 0.539, 'grad_norm': 4.170958995819092, 'learning_rate': 3.857031959421553e-05, 'epoch': 3.69}\n",
- "{'loss': 0.5249, 'grad_norm': 6.283390045166016, 'learning_rate': 3.806525609984312e-05, 'epoch': 3.71}\n",
- "{'loss': 0.4406, 'grad_norm': 6.235040664672852, 'learning_rate': 3.7561480692983006e-05, 'epoch': 3.73}\n",
- "{'loss': 0.553, 'grad_norm': 3.715141534805298, 'learning_rate': 3.705904774487396e-05, 'epoch': 3.75}\n",
- "{'loss': 0.5154, 'grad_norm': 6.352488040924072, 'learning_rate': 3.655801148186655e-05, 'epoch': 3.77}\n",
- "{'loss': 0.4681, 'grad_norm': 4.480152130126953, 'learning_rate': 3.6058425979570485e-05, 'epoch': 3.78}\n",
- "{'loss': 0.4915, 'grad_norm': 5.1917219161987305, 'learning_rate': 3.556034515701852e-05, 'epoch': 3.8}\n",
- "{'loss': 0.5371, 'grad_norm': 4.501936912536621, 'learning_rate': 3.506382277084696e-05, 'epoch': 3.82}\n",
- "{'loss': 0.5273, 'grad_norm': 3.53322434425354, 'learning_rate': 3.4568912409493945e-05, 'epoch': 3.84}\n",
- "{'loss': 0.4405, 'grad_norm': 4.688470840454102, 'learning_rate': 3.4075667487415785e-05, 'epoch': 3.86}\n",
- "{'loss': 0.5048, 'grad_norm': 6.739779949188232, 'learning_rate': 3.358414123932195e-05, 'epoch': 3.87}\n",
- "{'loss': 0.5572, 'grad_norm': 4.120084762573242, 'learning_rate': 3.3094386714429724e-05, 'epoch': 3.89}\n",
- "{'loss': 0.5498, 'grad_norm': 7.938605785369873, 'learning_rate': 3.2606456770738636e-05, 'epoch': 3.91}\n",
- "{'loss': 0.3955, 'grad_norm': 4.132835865020752, 'learning_rate': 3.212040406932569e-05, 'epoch': 3.93}\n",
- "{'loss': 0.4149, 'grad_norm': 3.777303457260132, 'learning_rate': 3.163628106866172e-05, 'epoch': 3.94}\n",
- "{'loss': 0.6278, 'grad_norm': 5.201406955718994, 'learning_rate': 3.115414001894974e-05, 'epoch': 3.96}\n",
- "{'loss': 0.5186, 'grad_norm': 7.258588790893555, 'learning_rate': 3.067403295648566e-05, 'epoch': 3.98}\n",
- "{'loss': 0.4754, 'grad_norm': 3.8839337825775146, 'learning_rate': 3.019601169804216e-05, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [1:01:49<29:29, 1.58s/it][INFO|trainer.py:3788] 2024-07-05 07:18:16,440 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 07:18:16,441 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 07:18:16,441 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 22.55it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:02, 19.36it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:02, 18.55it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:00<00:02, 17.05it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:02, 16.37it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 16.48it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:00<00:01, 16.70it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:01<00:01, 16.96it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:01<00:01, 17.35it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:01<00:01, 17.29it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 17.27it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 17.07it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:01, 17.13it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:00, 16.29it/s]\u001b[A\n",
- " 70%|โ๏ฟฝ๏ฟฝ๏ฟฝโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 16.28it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:01<00:00, 16.60it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:02<00:00, 16.25it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:02<00:00, 16.33it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 16.07it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 16.22it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 15.96it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 2.7581844329833984, 'eval_runtime': 2.8365, 'eval_samples_per_second': 16.217, 'eval_steps_per_second': 16.217, 'epoch': 4.0}\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 2240/3360 [1:01:52<29:29, 1.58s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 15.12it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-05 07:18:19,279 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-2240\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 07:18:20,481 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 07:18:20,481 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "{'loss': 0.3004, 'grad_norm': 3.442147731781006, 'learning_rate': 2.9720127835276256e-05, 'epoch': 4.02}\n",
- "{'loss': 0.2198, 'grad_norm': 3.406682252883911, 'learning_rate': 2.9246432729161055e-05, 'epoch': 4.03}\n",
- "{'loss': 0.2312, 'grad_norm': 3.0993845462799072, 'learning_rate': 2.8774977504442647e-05, 'epoch': 4.05}\n",
- "{'loss': 0.2038, 'grad_norm': 4.1203694343566895, 'learning_rate': 2.8305813044122097e-05, 'epoch': 4.07}\n",
- "{'loss': 0.1819, 'grad_norm': 2.116147756576538, 'learning_rate': 2.7838989983964065e-05, 'epoch': 4.09}\n",
- "{'loss': 0.2672, 'grad_norm': 3.2091379165649414, 'learning_rate': 2.737455870703155e-05, 'epoch': 4.11}\n",
- "{'loss': 0.1779, 'grad_norm': 2.386085033416748, 'learning_rate': 2.6912569338248315e-05, 'epoch': 4.12}\n",
- "{'loss': 0.2997, 'grad_norm': 28.640592575073242, 'learning_rate': 2.645307173898901e-05, 'epoch': 4.14}\n",
- "{'loss': 0.227, 'grad_norm': 2.2596945762634277, 'learning_rate': 2.5996115501697694e-05, 'epoch': 4.16}\n",
- "{'loss': 0.2184, 'grad_norm': 4.521151065826416, 'learning_rate': 2.5541749944535554e-05, 'epoch': 4.18}\n",
- "{'loss': 0.2038, 'grad_norm': 5.861654281616211, 'learning_rate': 2.5090024106057962e-05, 'epoch': 4.19}\n",
- "{'loss': 0.2423, 'grad_norm': 4.1528639793396, 'learning_rate': 2.464098673992205e-05, 'epoch': 4.21}\n",
- "{'loss': 0.2367, 'grad_norm': 2.1180801391601562, 'learning_rate': 2.4194686309624663e-05, 'epoch': 4.23}\n",
- "{'loss': 0.2627, 'grad_norm': 4.030113697052002, 'learning_rate': 2.3751170983272e-05, 'epoch': 4.25}\n",
- "{'loss': 0.2329, 'grad_norm': 4.907358646392822, 'learning_rate': 2.3310488628380757e-05, 'epoch': 4.27}\n",
- "{'loss': 0.2542, 'grad_norm': 4.652915000915527, 'learning_rate': 2.2872686806712035e-05, 'epoch': 4.28}\n",
- "{'loss': 0.2495, 'grad_norm': 5.10890531539917, 'learning_rate': 2.243781276913811e-05, 'epoch': 4.3}\n",
- "{'loss': 0.1662, 'grad_norm': 3.823878288269043, 'learning_rate': 2.200591345054267e-05, 'epoch': 4.32}\n",
- "{'loss': 0.2812, 'grad_norm': 3.004128932952881, 'learning_rate': 2.157703546475539e-05, 'epoch': 4.34}\n",
- "{'loss': 0.2202, 'grad_norm': 4.443856716156006, 'learning_rate': 2.115122509952085e-05, 'epoch': 4.36}\n",
- "{'loss': 0.2244, 'grad_norm': 2.996962070465088, 'learning_rate': 2.0728528311502976e-05, 'epoch': 4.37}\n",
- "{'loss': 0.2474, 'grad_norm': 4.116214752197266, 'learning_rate': 2.0308990721324927e-05, 'epoch': 4.39}\n",
- "{'loss': 0.1881, 'grad_norm': 4.773007392883301, 'learning_rate': 1.989265760864542e-05, 'epoch': 4.41}\n",
- "{'loss': 0.2721, 'grad_norm': 3.045060873031616, 'learning_rate': 1.947957390727185e-05, 'epoch': 4.43}\n",
- "{'loss': 0.2474, 'grad_norm': 5.480595111846924, 'learning_rate': 1.906978420031059e-05, 'epoch': 4.44}\n",
- "{'loss': 0.1786, 'grad_norm': 2.452791929244995, 'learning_rate': 1.8663332715355396e-05, 'epoch': 4.46}\n",
- "{'loss': 0.2655, 'grad_norm': 1.6951186656951904, 'learning_rate': 1.8260263319713844e-05, 'epoch': 4.48}\n",
- "{'loss': 0.2307, 'grad_norm': 4.780274868011475, 'learning_rate': 1.7860619515673033e-05, 'epoch': 4.5}\n",
- "{'loss': 0.2661, 'grad_norm': 4.14153528213501, 'learning_rate': 1.746444443580433e-05, 'epoch': 4.52}\n",
- "{'loss': 0.2482, 'grad_norm': 4.6406989097595215, 'learning_rate': 1.7071780838308288e-05, 'epoch': 4.53}\n",
- "{'loss': 0.2268, 'grad_norm': 3.8813576698303223, 'learning_rate': 1.6682671102399805e-05, 'epoch': 4.55}\n",
- "{'loss': 0.2058, 'grad_norm': 3.428504467010498, 'learning_rate': 1.629715722373423e-05, 'epoch': 4.57}\n",
- "{'loss': 0.2407, 'grad_norm': 4.3143415451049805, 'learning_rate': 1.5915280809874932e-05, 'epoch': 4.59}\n",
- "{'loss': 0.2153, 'grad_norm': 4.420351505279541, 'learning_rate': 1.553708307580265e-05, 'epoch': 4.61}\n",
- "{'loss': 0.2423, 'grad_norm': 3.758807897567749, 'learning_rate': 1.5162604839467265e-05, 'epoch': 4.62}\n",
- "{'loss': 0.2376, 'grad_norm': 3.164726734161377, 'learning_rate': 1.4791886517382413e-05, 'epoch': 4.64}\n",
- "{'loss': 0.223, 'grad_norm': 2.6924712657928467, 'learning_rate': 1.4424968120263504e-05, 'epoch': 4.66}\n",
- "{'loss': 0.224, 'grad_norm': 3.1326253414154053, 'learning_rate': 1.4061889248709343e-05, 'epoch': 4.68}\n",
- "{'loss': 0.2097, 'grad_norm': 3.1166789531707764, 'learning_rate': 1.370268908892825e-05, 'epoch': 4.69}\n",
- "{'loss': 0.2607, 'grad_norm': 4.3387651443481445, 'learning_rate': 1.3347406408508695e-05, 'epoch': 4.71}\n",
- "{'loss': 0.2217, 'grad_norm': 2.9194934368133545, 'learning_rate': 1.2996079552235263e-05, 'epoch': 4.73}\n",
- "{'loss': 0.175, 'grad_norm': 2.6297366619110107, 'learning_rate': 1.264874643795021e-05, 'epoch': 4.75}\n",
- "{'loss': 0.2148, 'grad_norm': 3.174553632736206, 'learning_rate': 1.230544455246101e-05, 'epoch': 4.77}\n",
- "{'loss': 0.246, 'grad_norm': 3.611652374267578, 'learning_rate': 1.1966210947494583e-05, 'epoch': 4.78}\n",
- "{'loss': 0.2477, 'grad_norm': 3.13002610206604, 'learning_rate': 1.1631082235698316e-05, 'epoch': 4.8}\n",
- "{'loss': 0.2034, 'grad_norm': 3.1411221027374268, 'learning_rate': 1.130009458668863e-05, 'epoch': 4.82}\n",
- "{'loss': 0.1899, 'grad_norm': 3.8253543376922607, 'learning_rate': 1.097328372314721e-05, 'epoch': 4.84}\n",
- "{'loss': 0.2432, 'grad_norm': 4.582285404205322, 'learning_rate': 1.0650684916965559e-05, 'epoch': 4.85}\n",
- "{'loss': 0.2412, 'grad_norm': 3.9309003353118896, 'learning_rate': 1.0332332985438248e-05, 'epoch': 4.87}\n",
- "{'loss': 0.2543, 'grad_norm': 4.181048393249512, 'learning_rate': 1.0018262287505086e-05, 'epoch': 4.89}\n",
- "{'loss': 0.2759, 'grad_norm': 1.8343684673309326, 'learning_rate': 9.708506720042932e-06, 'epoch': 4.91}\n",
- "{'loss': 0.189, 'grad_norm': 2.335709571838379, 'learning_rate': 9.403099714207175e-06, 'epoch': 4.93}\n",
- "{'loss': 0.2495, 'grad_norm': 4.065298080444336, 'learning_rate': 9.102074231823727e-06, 'epoch': 4.94}\n",
- "{'loss': 0.2557, 'grad_norm': 2.8378493785858154, 'learning_rate': 8.805462761831418e-06, 'epoch': 4.96}\n",
- "{'loss': 0.2784, 'grad_norm': 3.395693063735962, 'learning_rate': 8.513297316775625e-06, 'epoch': 4.98}\n",
- "{'loss': 0.2621, 'grad_norm': 4.082712173461914, 'learning_rate': 8.225609429353187e-06, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [1:16:55<15:13, 1.63s/it][INFO|trainer.py:3788] 2024-07-05 07:33:22,309 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 07:33:22,309 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 07:33:22,310 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 25.87it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:02, 18.98it/s]\u001b[A\n",
- " 20%|โโโโโโโโโ | 9/46 [00:00<00:02, 17.81it/s]\u001b[A\n",
- " 24%|โโโโโโโโโโโ | 11/46 [00:00<00:01, 17.53it/s]\u001b[A\n",
- " 28%|โโโโโโโโโโโโโ | 13/46 [00:00<00:01, 17.51it/s]\u001b[A\n",
- " 33%|โโโโโโโโโโโโโโ | 15/46 [00:00<00:01, 17.04it/s]\u001b[A\n",
- " 37%|โโโโโโโโโโโโโโโโ | 17/46 [00:00<00:01, 17.21it/s]\u001b[A\n",
- " 41%|โโโโโโโโโโโโโโโโโโ | 19/46 [00:01<00:01, 16.95it/s]\u001b[A\n",
- " 46%|โโโโโโโโโโโโโโโโโโโโ | 21/46 [00:01<00:01, 16.41it/s]\u001b[A\n",
- " 50%|โโโโโโโโโโโโโโโโโโโโโโ | 23/46 [00:01<00:01, 16.44it/s]\u001b[A\n",
- " 54%|โโโโโโโโโโโโโโโโโโโโโโโโ | 25/46 [00:01<00:01, 16.30it/s]\u001b[A\n",
- " 59%|โโโโโโโโโโโโโโโโโโโโโโโโโโ | 27/46 [00:01<00:01, 15.42it/s]\u001b[A\n",
- " 63%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 29/46 [00:01<00:01, 16.03it/s]\u001b[A\n",
- " 67%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 31/46 [00:01<00:00, 15.59it/s]\u001b[A\n",
- " 72%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 33/46 [00:01<00:00, 15.23it/s]\u001b[A\n",
- " 76%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 35/46 [00:02<00:00, 13.78it/s]\u001b[A\n",
- " 80%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 37/46 [00:02<00:00, 14.60it/s]\u001b[A\n",
- " 85%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 39/46 [00:02<00:00, 14.85it/s]\u001b[A\n",
- " 89%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 41/46 [00:02<00:00, 14.92it/s]\u001b[A\n",
- " 93%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 43/46 [00:02<00:00, 15.67it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 3.1747782230377197, 'eval_runtime': 2.9312, 'eval_samples_per_second': 15.693, 'eval_steps_per_second': 15.693, 'epoch': 5.0}\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 2800/3360 [1:16:58<15:13, 1.63s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 15.43it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-05 07:33:25,242 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-2800\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 07:33:26,470 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 07:33:26,471 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "{'loss': 0.1274, 'grad_norm': 1.5695966482162476, 'learning_rate': 7.942430149009161e-06, 'epoch': 5.02}\n",
- "{'loss': 0.1052, 'grad_norm': 0.5931769609451294, 'learning_rate': 7.663790038585793e-06, 'epoch': 5.03}\n",
- "{'loss': 0.1188, 'grad_norm': 4.6362762451171875, 'learning_rate': 7.389719171023857e-06, 'epoch': 5.05}\n",
- "{'loss': 0.1032, 'grad_norm': 2.54799485206604, 'learning_rate': 7.1202471261170245e-06, 'epoch': 5.07}\n",
- "{'loss': 0.0781, 'grad_norm': 2.0741422176361084, 'learning_rate': 6.855402987319348e-06, 'epoch': 5.09}\n",
- "{'loss': 0.1428, 'grad_norm': 2.47188401222229, 'learning_rate': 6.595215338606397e-06, 'epoch': 5.1}\n",
- "{'loss': 0.0839, 'grad_norm': 1.993886947631836, 'learning_rate': 6.339712261390213e-06, 'epoch': 5.12}\n",
- "{'loss': 0.1086, 'grad_norm': 1.442935824394226, 'learning_rate': 6.088921331488568e-06, 'epoch': 5.14}\n",
- "{'loss': 0.0851, 'grad_norm': 2.1466658115386963, 'learning_rate': 5.8428696161488215e-06, 'epoch': 5.16}\n",
- "{'loss': 0.0964, 'grad_norm': 1.960119366645813, 'learning_rate': 5.601583671126531e-06, 'epoch': 5.18}\n",
- "{'loss': 0.1149, 'grad_norm': 1.3245364427566528, 'learning_rate': 5.365089537819434e-06, 'epoch': 5.19}\n",
- "{'loss': 0.1074, 'grad_norm': 0.817304253578186, 'learning_rate': 5.133412740456806e-06, 'epoch': 5.21}\n",
- "{'loss': 0.0966, 'grad_norm': 1.4587805271148682, 'learning_rate': 4.906578283344759e-06, 'epoch': 5.23}\n",
- "{'loss': 0.1326, 'grad_norm': 5.115628719329834, 'learning_rate': 4.684610648167503e-06, 'epoch': 5.25}\n",
- "{'loss': 0.1112, 'grad_norm': 2.1370065212249756, 'learning_rate': 4.467533791345191e-06, 'epoch': 5.27}\n",
- "{'loss': 0.0918, 'grad_norm': 1.5177031755447388, 'learning_rate': 4.255371141448272e-06, 'epoch': 5.28}\n",
- "{'loss': 0.0911, 'grad_norm': 3.415386199951172, 'learning_rate': 4.048145596668967e-06, 'epoch': 5.3}\n",
- "{'loss': 0.1295, 'grad_norm': 9.106415748596191, 'learning_rate': 3.84587952234991e-06, 'epoch': 5.32}\n",
- "{'loss': 0.0753, 'grad_norm': 1.1960046291351318, 'learning_rate': 3.6485947485702832e-06, 'epoch': 5.34}\n",
- "{'loss': 0.1281, 'grad_norm': 3.4662070274353027, 'learning_rate': 3.4563125677897932e-06, 'epoch': 5.35}\n",
- "{'loss': 0.1045, 'grad_norm': 1.4903005361557007, 'learning_rate': 3.269053732550581e-06, 'epoch': 5.37}\n",
- "{'loss': 0.094, 'grad_norm': 2.3145623207092285, 'learning_rate': 3.086838453237506e-06, 'epoch': 5.39}\n",
- "{'loss': 0.056, 'grad_norm': 1.6177632808685303, 'learning_rate': 2.9096863958968268e-06, 'epoch': 5.41}\n",
- "{'loss': 0.093, 'grad_norm': 1.7712160348892212, 'learning_rate': 2.737616680113758e-06, 'epoch': 5.43}\n",
- "{'loss': 0.0846, 'grad_norm': 2.1207849979400635, 'learning_rate': 2.570647876948895e-06, 'epoch': 5.44}\n",
- "{'loss': 0.1257, 'grad_norm': 1.7891684770584106, 'learning_rate': 2.408798006933882e-06, 'epoch': 5.46}\n",
- "{'loss': 0.1472, 'grad_norm': 1.305862545967102, 'learning_rate': 2.252084538126542e-06, 'epoch': 5.48}\n",
- "{'loss': 0.0784, 'grad_norm': 2.511289596557617, 'learning_rate': 2.100524384225555e-06, 'epoch': 5.5}\n",
- "{'loss': 0.1159, 'grad_norm': 2.205674886703491, 'learning_rate': 1.9541339027450256e-06, 'epoch': 5.52}\n",
- "{'loss': 0.1057, 'grad_norm': 2.3121867179870605, 'learning_rate': 1.8129288932490274e-06, 'epoch': 5.53}\n",
- "{'loss': 0.1044, 'grad_norm': 0.5653843283653259, 'learning_rate': 1.6769245956464396e-06, 'epoch': 5.55}\n",
- "{'loss': 0.1248, 'grad_norm': 2.8058314323425293, 'learning_rate': 1.5461356885461075e-06, 'epoch': 5.57}\n",
- "{'loss': 0.1108, 'grad_norm': 1.7656151056289673, 'learning_rate': 1.4205762876726092e-06, 'epoch': 5.59}\n",
- "{'loss': 0.103, 'grad_norm': 1.4396343231201172, 'learning_rate': 1.3002599443428243e-06, 'epoch': 5.6}\n",
- "{'loss': 0.1239, 'grad_norm': 2.5784292221069336, 'learning_rate': 1.1851996440033319e-06, 'epoch': 5.62}\n",
- "{'loss': 0.0961, 'grad_norm': 0.813414990901947, 'learning_rate': 1.0754078048289374e-06, 'epoch': 5.64}\n",
- "{'loss': 0.1176, 'grad_norm': 2.7768945693969727, 'learning_rate': 9.708962763824048e-07, 'epoch': 5.66}\n",
- "{'loss': 0.0784, 'grad_norm': 1.4548313617706299, 'learning_rate': 8.716763383355864e-07, 'epoch': 5.68}\n",
- "{'loss': 0.0995, 'grad_norm': 1.4250032901763916, 'learning_rate': 7.777586992519959e-07, 'epoch': 5.69}\n",
- "{'loss': 0.1014, 'grad_norm': 3.0032870769500732, 'learning_rate': 6.891534954310885e-07, 'epoch': 5.71}\n",
- "{'loss': 0.0993, 'grad_norm': 3.392124891281128, 'learning_rate': 6.058702898142643e-07, 'epoch': 5.73}\n",
- "{'loss': 0.0962, 'grad_norm': 2.9156267642974854, 'learning_rate': 5.279180709527765e-07, 'epoch': 5.75}\n",
- "{'loss': 0.1407, 'grad_norm': 2.50022292137146, 'learning_rate': 4.553052520375911e-07, 'epoch': 5.77}\n",
- "{'loss': 0.0832, 'grad_norm': 1.8522708415985107, 'learning_rate': 3.8803966999139684e-07, 'epoch': 5.78}\n",
- "{'loss': 0.1036, 'grad_norm': 2.559648275375366, 'learning_rate': 3.261285846227868e-07, 'epoch': 5.8}\n",
- "{'loss': 0.0851, 'grad_norm': 6.124639987945557, 'learning_rate': 2.6957867784270787e-07, 'epoch': 5.82}\n",
- "{'loss': 0.0861, 'grad_norm': 1.8628261089324951, 'learning_rate': 2.1839605294330933e-07, 'epoch': 5.84}\n",
- "{'loss': 0.1153, 'grad_norm': 2.0182836055755615, 'learning_rate': 1.725862339392259e-07, 'epoch': 5.85}\n",
- "{'loss': 0.0913, 'grad_norm': 2.184485912322998, 'learning_rate': 1.3215416497138754e-07, 'epoch': 5.87}\n",
- "{'loss': 0.132, 'grad_norm': 2.652066707611084, 'learning_rate': 9.710420977340762e-08, 'epoch': 5.89}\n",
- "{'loss': 0.0822, 'grad_norm': 2.054509401321411, 'learning_rate': 6.744015120061509e-08, 'epoch': 5.91}\n",
- "{'loss': 0.1632, 'grad_norm': 2.1160929203033447, 'learning_rate': 4.316519082179227e-08, 'epoch': 5.93}\n",
- "{'loss': 0.0715, 'grad_norm': 3.3849403858184814, 'learning_rate': 2.4281948573617874e-08, 'epoch': 5.94}\n",
- "{'loss': 0.1134, 'grad_norm': 3.3306052684783936, 'learning_rate': 1.0792462477909882e-08, 'epoch': 5.96}\n",
- "{'loss': 0.1273, 'grad_norm': 2.356410026550293, 'learning_rate': 2.6981884216847884e-09, 'epoch': 5.98}\n",
- "{'loss': 0.1189, 'grad_norm': 2.4627721309661865, 'learning_rate': 0.0, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [1:31:57<00:00, 1.60s/it][INFO|trainer.py:3788] 2024-07-05 07:48:24,113 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 07:48:24,113 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 07:48:24,113 >> Batch size = 1\n",
- "\n",
- " 0%| | 0/46 [00:00, ?it/s]\u001b[A\n",
- " 7%|โโโ | 3/46 [00:00<00:01, 23.35it/s]\u001b[A\n",
- " 13%|โโโโโโ | 6/46 [00:00<00:02, 18.72it/s]\u001b[A\n",
- " 17%|โโโโโโโโ | 8/46 [00:00<00:02, 18.62it/s]\u001b[A\n",
- " 22%|โโโโโโโโโโ | 10/46 [00:00<00:01, 18.07it/s]\u001b[A\n",
- " 26%|โโโโโโโโโโโโ | 12/46 [00:00<00:01, 17.18it/s]\u001b[A\n",
- " 30%|โโโโโโโโโโโโโ | 14/46 [00:00<00:01, 17.36it/s]\u001b[A\n",
- " 35%|โโโโโโโโโโโโโโโ | 16/46 [00:00<00:01, 17.30it/s]\u001b[A\n",
- " 39%|โโโโโโโโโโโโโโโโโ | 18/46 [00:01<00:01, 17.39it/s]\u001b[A\n",
- " 43%|โโโโโโโโโโโโโโโโโโโ | 20/46 [00:01<00:01, 17.07it/s]\u001b[A\n",
- " 48%|โโโโโโโโโโโโโโโโโโโโโ | 22/46 [00:01<00:01, 17.15it/s]\u001b[A\n",
- " 52%|โโโโโโโโโโโโโโโโโโโโโโโ | 24/46 [00:01<00:01, 16.56it/s]\u001b[A\n",
- " 57%|โโโโโโโโโโโโโโโโโโโโโโโโโ | 26/46 [00:01<00:01, 16.08it/s]\u001b[A\n",
- " 61%|โโโโโโโโโโโโโโโโโโโโโโโโโโโ | 28/46 [00:01<00:01, 16.33it/s]\u001b[A\n",
- " 65%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 30/46 [00:01<00:01, 15.84it/s]\u001b[A\n",
- " 70%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 32/46 [00:01<00:00, 16.53it/s]\u001b[A\n",
- " 74%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 34/46 [00:02<00:00, 16.00it/s]\u001b[A\n",
- " 78%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 36/46 [00:02<00:00, 15.36it/s]\u001b[A\n",
- " 83%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 38/46 [00:02<00:00, 15.20it/s]\u001b[A\n",
- " 87%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 40/46 [00:02<00:00, 15.28it/s]\u001b[A\n",
- " 91%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 42/46 [00:02<00:00, 15.21it/s]\u001b[A\n",
- " 96%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | 44/46 [00:02<00:00, 15.97it/s]\u001b[A\n",
- " \u001b[A\n",
- "\u001b[A{'eval_loss': 3.542919397354126, 'eval_runtime': 2.8444, 'eval_samples_per_second': 16.172, 'eval_steps_per_second': 16.172, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [1:31:59<00:00, 1.60s/it]\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:02<00:00, 16.55it/s]\u001b[A\n",
- " \u001b[A[INFO|trainer.py:3478] 2024-07-05 07:48:26,960 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft/checkpoint-3360\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 07:48:28,128 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 07:48:28,128 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "[INFO|:482] 2024-07-05 07:48:28,348 >> \n",
- "\n",
- "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
- "\n",
- "\n",
- "{'train_runtime': 5527.5332, 'train_samples_per_second': 4.865, 'train_steps_per_second': 0.608, 'train_loss': 0.927943646074051, 'epoch': 6.0}\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3360/3360 [1:32:01<00:00, 1.64s/it]\n",
- "[INFO|trainer.py:3478] 2024-07-05 07:48:28,351 >> Saving model checkpoint to saves/qwen2-0.5b/lora/sft\n",
- "[INFO|configuration_utils.py:733] 2024-07-05 07:48:29,375 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--unsloth--qwen2-0.5b-instruct-bnb-4bit/snapshots/c3b24ce4827d69f5c3bde9aba00047774069ab72/config.json\n",
- "[INFO|configuration_utils.py:800] 2024-07-05 07:48:29,376 >> Model config Qwen2Config {\n",
- " \"_name_or_path\": \"Qwen/Qwen2-0.5B-Instruct\",\n",
- " \"architectures\": [\n",
- " \"Qwen2ForCausalLM\"\n",
- " ],\n",
- " \"attention_dropout\": 0.0,\n",
- " \"bos_token_id\": 151643,\n",
- " \"eos_token_id\": 151645,\n",
- " \"hidden_act\": \"silu\",\n",
- " \"hidden_size\": 896,\n",
- " \"initializer_range\": 0.02,\n",
- " \"intermediate_size\": 4864,\n",
- " \"max_position_embeddings\": 32768,\n",
- " \"max_window_layers\": 24,\n",
- " \"model_type\": \"qwen2\",\n",
- " \"num_attention_heads\": 14,\n",
- " \"num_hidden_layers\": 24,\n",
- " \"num_key_value_heads\": 2,\n",
- " \"quantization_config\": {\n",
- " \"_load_in_4bit\": true,\n",
- " \"_load_in_8bit\": false,\n",
- " \"bnb_4bit_compute_dtype\": \"bfloat16\",\n",
- " \"bnb_4bit_quant_storage\": \"uint8\",\n",
- " \"bnb_4bit_quant_type\": \"nf4\",\n",
- " \"bnb_4bit_use_double_quant\": true,\n",
- " \"llm_int8_enable_fp32_cpu_offload\": false,\n",
- " \"llm_int8_has_fp16_weight\": false,\n",
- " \"llm_int8_skip_modules\": null,\n",
- " \"llm_int8_threshold\": 6.0,\n",
- " \"load_in_4bit\": true,\n",
- " \"load_in_8bit\": false,\n",
- " \"quant_method\": \"bitsandbytes\"\n",
- " },\n",
- " \"rms_norm_eps\": 1e-06,\n",
- " \"rope_theta\": 1000000.0,\n",
- " \"sliding_window\": 32768,\n",
- " \"tie_word_embeddings\": true,\n",
- " \"torch_dtype\": \"bfloat16\",\n",
- " \"transformers_version\": \"4.42.3\",\n",
- " \"use_cache\": true,\n",
- " \"use_sliding_window\": false,\n",
- " \"vocab_size\": 151936\n",
- "}\n",
- "\n",
- "***** train metrics *****\n",
- " epoch = 5.9973\n",
- " total_flos = 6320365GF\n",
- " train_loss = 0.9279\n",
- " train_runtime = 1:32:07.53\n",
- " train_samples_per_second = 4.865\n",
- " train_steps_per_second = 0.608\n",
- "Figure saved at: saves/qwen2-0.5b/lora/sft/training_loss.png\n",
- "Figure saved at: saves/qwen2-0.5b/lora/sft/training_eval_loss.png\n",
- "[INFO|trainer.py:3788] 2024-07-05 07:48:29,751 >> \n",
- "***** Running Evaluation *****\n",
- "[INFO|trainer.py:3790] 2024-07-05 07:48:29,752 >> Num examples = 46\n",
- "[INFO|trainer.py:3793] 2024-07-05 07:48:29,752 >> Batch size = 1\n",
- "100%|โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 46/46 [00:03<00:00, 15.10it/s]\n",
- "***** eval metrics *****\n",
- " epoch = 5.9973\n",
- " eval_loss = 3.5429\n",
- " eval_runtime = 0:00:03.16\n",
- " eval_samples_per_second = 14.532\n",
- " eval_steps_per_second = 14.532\n",
- "[INFO|modelcard.py:449] 2024-07-05 07:48:32,920 >> Dropping the following result as it does not have all the necessary fields:\n",
- "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: / 0.561 MB of 0.561 MB uploaded\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime โโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second โโโโโ
โโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second โโโโโ
โโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step โโโโโโโโโโโโโโโโโโโโโ
โ
โ
โ
โ
โโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate โโโ
โโโโโโโโโโโโโโโโโ
โ
โ
โโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss โโโโโโโโ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss 3.54292\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime 3.1655\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second 14.532\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second 14.532\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: total_flos 6786441021493248.0\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch 5.99732\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step 3360\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/grad_norm 2.46277\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate 0.0\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 0.1189\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_loss 0.92794\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_runtime 5527.5332\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_samples_per_second 4.865\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: train_steps_per_second 0.608\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: ๐ View run \u001b[33mqwen2_0.5b_lora_sft\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface/runs/3amepb0m\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: โญ๏ธ View project at: \u001b[34m\u001b[4mhttps://wandb.ai/inflaton-ai/huggingface\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20240705_061623-3amepb0m/logs\u001b[0m\n",
- "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The new W&B backend becomes opt-out in version 0.18.0; try it out with `wandb.require(\"core\")`! See https://wandb.me/wandb-core for more information.\n",
- "CPU times: user 1min 4s, sys: 21.9 s, total: 1min 26s\n",
- "Wall time: 1h 33min 9s\n"
- ]
- }
- ],
- "source": [
- "%%time\n",
- "\n",
- "!./scripts/tune-lf.sh config/qwen2_0.5b_lora_sft_unsloth.yaml"
- ]
- }
- ],
- "metadata": {
- "accelerator": "GPU",
- "application/vnd.databricks.v1+notebook": {
- "dashboards": [],
- "environmentMetadata": null,
- "language": "python",
- "notebookMetadata": {
- "pythonIndentUnit": 4
- },
- "notebookName": "07_MAC_+_Qwen2-7B-Instructi_Unsloth_train",
- "widgets": {}
- },
- "colab": {
- "gpuType": "T4",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.9"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "036fc5746f43416db18c19ad8fd36677": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "06e806c82c7b4cbea31c5358dd9c3434": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "087b76a8b7514269b1f0ab29b062e444": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_a069d2ab23824f29aa320ac256e2cfe9",
- "placeholder": "โ",
- "style": "IPY_MODEL_06e806c82c7b4cbea31c5358dd9c3434",
- "value": "Mapโ(num_proc=2):โ100%"
- }
- },
- "09b76013aa9e45efb6deb23a7a0d0925": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_dea41c5260884aa6879b5e1d1697b14f",
- "placeholder": "โ",
- "style": "IPY_MODEL_89965917796a4f81b899fdc7685f33df",
- "value": "config.json:โ100%"
- }
- },
- "0a92c56bfa134ef583220d7ef0b13e17": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "0c34be936c8145d3ab41282f30a70713": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "0f8b6bfe16894500838793f2491d403f": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "177c78fce95d4b4ab33057c5a048d693": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "1f44c9ce1adf470cbb19784493ed209f": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_0c34be936c8145d3ab41282f30a70713",
- "placeholder": "โ",
- "style": "IPY_MODEL_0a92c56bfa134ef583220d7ef0b13e17",
- "value": "model.safetensors:โ100%"
- }
- },
- "201b59ccd9f845e197029b57e424aefc": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "2157f01726d748f8a9ae4a00664430da": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "21db8a77b00d4a4e82fdfa608657531f": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "26e4202cca81496a90d15a0dd4ca9cf1": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_ba90fdb8822d47dab7ba203bee297f37",
- "IPY_MODEL_61560ff6a36b44f4a9dfdae5c52791d4",
- "IPY_MODEL_95fbe66647904c06a20f640630d6dc0e"
- ],
- "layout": "IPY_MODEL_57182a263d324a3dbf1471c74290a0d5"
- }
- },
- "27155728b6b84cb199c91c940095d0a8": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_6b91feeed5464877991ac2c207aebe7c",
- "IPY_MODEL_cca8113c54c0495daedce1327bf9c68b",
- "IPY_MODEL_2e63a29e2f7247bba5beede9a568c99f"
- ],
- "layout": "IPY_MODEL_5c9d781c28944f3eb86e2a6d44efdf18"
- }
- },
- "271ddaa553a042d09b6db7b450643d8f": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "2a58d04b428c46f4b3dbadd3bc6cd529": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "2d18ddf6482c4d97829ac0e5a7b9868f": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_9f679ad3ec7f4fe8ad0510ffb57bc2ab",
- "IPY_MODEL_f2df530d22c74977b249dd9fb5f4829b",
- "IPY_MODEL_89b2ef0dbfea47ab8e6f8d659e3351d1"
- ],
- "layout": "IPY_MODEL_3056b148aa9f4e6e8aa3b61d26886255"
- }
- },
- "2e5087c76f98437cb5dc729230358cba": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "2e63a29e2f7247bba5beede9a568c99f": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_b993eaec6b224440bf80c0958c6fb536",
- "placeholder": "โ",
- "style": "IPY_MODEL_de868e26e7154f62aa86223a539ad421",
- "value": "โ464/464โ[00:00<00:00,โ27.1kB/s]"
- }
- },
- "2f6c70dd266c4816bfad3fd3d192929a": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "30307300bc4e4baf96560e30969a82b6": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_e36a3f9eff0e4cf68834d66b0213ae96",
- "placeholder": "โ",
- "style": "IPY_MODEL_a0037bdccf254159becde630bee3d1db",
- "value": "generation_config.json:โ100%"
- }
- },
- "3056b148aa9f4e6e8aa3b61d26886255": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "30cdc32298134cb0be4d41615b9e5774": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "3572201bd4d74a58b7a665f9bdfdcdba": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "35b0e8c26d6640e9bd0ed7b242a423d8": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_2e5087c76f98437cb5dc729230358cba",
- "max": 51760,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_036fc5746f43416db18c19ad8fd36677",
- "value": 51760
- }
- },
- "36166c7bcb854b34aca1f41a5d6ea50b": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "370692d819df41828b48c4ad446f977b": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "39b29a75374b45c0a22506010be2b84e": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_30cdc32298134cb0be4d41615b9e5774",
- "max": 1179,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_47928317548c454bba6358ab132e8dee",
- "value": 1179
- }
- },
- "3cf2dd993b5e4d3daecf61e4bab5a404": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_087b76a8b7514269b1f0ab29b062e444",
- "IPY_MODEL_35b0e8c26d6640e9bd0ed7b242a423d8",
- "IPY_MODEL_54ad89e05fd74576b9b8b5b5a10eaf8d"
- ],
- "layout": "IPY_MODEL_a41dc44766444a998bec2d777f249d23"
- }
- },
- "43dec2ede91341f5af60eb522e18e984": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "4463edd481c1467f914c7dcd6c6e6ffc": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "47928317548c454bba6358ab132e8dee": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "49277aeeac16434a865a4d12308b1abc": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "4ae7e449e4ea4c729b5f34607c18ebae": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "4b2061b8a73c43ffb0c2f83daf0d0183": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "4c4c88d4c701450692fa0f6b0c5764b0": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "4c666f4ace3943f8b80ecd20e7503236": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "4ccedf0d93094e63b57a0f8a434fba06": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_4463edd481c1467f914c7dcd6c6e6ffc",
- "max": 44307561,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_6d3b9a05db0b4dadb638c686faa0c40a",
- "value": 44307561
- }
- },
- "4dcf6ff672d24983a1877a8431709aa9": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_5807d5fb827d490fb3bc698f801ffff5",
- "placeholder": "โ",
- "style": "IPY_MODEL_c4f2b06a82fd4987b8b659524a7b503b",
- "value": "Generatingโtrainโsplit:โ100%"
- }
- },
- "4ea63adfce694725bdba878aef709dd3": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "5234566b1bfc4655b8d582ea5b46ed9f": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "54ad89e05fd74576b9b8b5b5a10eaf8d": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_fdb1941405ed4e4aa06019933892deb3",
- "placeholder": "โ",
- "style": "IPY_MODEL_668d5377ca56426a99753867e6e24862",
- "value": "โ51760/51760โ[01:02<00:00,โ1131.51โexamples/s]"
- }
- },
- "56aee4853b7740e6a977254f5d1fa66d": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "57182a263d324a3dbf1471c74290a0d5": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "5807d5fb827d490fb3bc698f801ffff5": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "5c9d781c28944f3eb86e2a6d44efdf18": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "5f40db8173dd4d76b6ef5ed6d9ec8b6e": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "61560ff6a36b44f4a9dfdae5c52791d4": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_db19fc8d37db4e45a5790a876836d8c4",
- "max": 11610,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_36166c7bcb854b34aca1f41a5d6ea50b",
- "value": 11610
- }
- },
- "6578fd7acdb54c4c93528ea431fd0144": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_370692d819df41828b48c4ad446f977b",
- "placeholder": "โ",
- "style": "IPY_MODEL_a0bf9160eb2647409b3200270914b90f",
- "value": "โ50.6k/50.6kโ[00:00<00:00,โ2.71MB/s]"
- }
- },
- "668d5377ca56426a99753867e6e24862": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "697f027529b54ee9956bae78a11e0611": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "69ac12aec0714318bf2c83d4f4e745f5": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "6b2012c3f88547af8884a9ea90e3164b": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_938f45f1b3e24118b815d96ae34ba86a",
- "placeholder": "โ",
- "style": "IPY_MODEL_9367047a800747f79c6b225d92397846",
- "value": "โ44.3M/44.3Mโ[00:01<00:00,โ31.0MB/s]"
- }
- },
- "6b91feeed5464877991ac2c207aebe7c": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_4b2061b8a73c43ffb0c2f83daf0d0183",
- "placeholder": "โ",
- "style": "IPY_MODEL_69ac12aec0714318bf2c83d4f4e745f5",
- "value": "special_tokens_map.json:โ100%"
- }
- },
- "6d3b9a05db0b4dadb638c686faa0c40a": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "6dbbedeca9314e66ae50e44ffa31a414": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "6e34619b45934040b6092e6fb01ea7fe": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "71ce208e20d6483abb9ed923510c86d7": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_d69dc491b3ab44d7852b21873ed7bb7f",
- "placeholder": "โ",
- "style": "IPY_MODEL_f401d53bf28e44eb906bce6c05412662",
- "value": "โ51760/51760โ[00:01<00:00,โ45512.81โexamples/s]"
- }
- },
- "7358cdad832342c983e31efb8754ab78": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "73e352a3404f4c7dad0737f57d29e92f": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_988a0e8c1f89446086858da0a891a79c",
- "IPY_MODEL_4ccedf0d93094e63b57a0f8a434fba06",
- "IPY_MODEL_6b2012c3f88547af8884a9ea90e3164b"
- ],
- "layout": "IPY_MODEL_7e29cb8dd4df4d5b94407cd8fd3f2011"
- }
- },
- "74501720ac7e4dbb911a4a99b3633bc6": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "78e5400bff924a92a4cc61c4ff18b182": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_b9b313fd861948f5aba25b24b1518d30",
- "placeholder": "โ",
- "style": "IPY_MODEL_4c666f4ace3943f8b80ecd20e7503236",
- "value": "โ1.18k/1.18kโ[00:00<00:00,โ31.3kB/s]"
- }
- },
- "7975adbc2ec5489ea7fa0167e620d85c": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_6e34619b45934040b6092e6fb01ea7fe",
- "max": 51760,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_271ddaa553a042d09b6db7b450643d8f",
- "value": 51760
- }
- },
- "7e29cb8dd4df4d5b94407cd8fd3f2011": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "810ff6c0e17d4fa09a30fef27eacff90": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "89965917796a4f81b899fdc7685f33df": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "89b2ef0dbfea47ab8e6f8d659e3351d1": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_b8908fa0df3743ecb9d12983a739104f",
- "placeholder": "โ",
- "style": "IPY_MODEL_177c78fce95d4b4ab33057c5a048d693",
- "value": "โ9.09M/9.09Mโ[00:00<00:00,โ32.6MB/s]"
- }
- },
- "8b3505352a5a42bf910428c40ce40465": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_49277aeeac16434a865a4d12308b1abc",
- "placeholder": "โ",
- "style": "IPY_MODEL_2157f01726d748f8a9ae4a00664430da",
- "value": "โ5.70G/5.70Gโ[01:02<00:00,โ30.1MB/s]"
- }
- },
- "8fc142b628fb40568730234de1cafde2": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_4ae7e449e4ea4c729b5f34607c18ebae",
- "max": 172,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_3572201bd4d74a58b7a665f9bdfdcdba",
- "value": 172
- }
- },
- "9367047a800747f79c6b225d92397846": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "938f45f1b3e24118b815d96ae34ba86a": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "95fbe66647904c06a20f640630d6dc0e": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_b0a370dc20654b279b9680692e34418e",
- "placeholder": "โ",
- "style": "IPY_MODEL_cfeb365ddf7548d58b2557f22737fcf5",
- "value": "โ11.6k/11.6kโ[00:00<00:00,โ716kB/s]"
- }
- },
- "988a0e8c1f89446086858da0a891a79c": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_ad2be500fc164c0f86f33e914ef8e6a0",
- "placeholder": "โ",
- "style": "IPY_MODEL_5234566b1bfc4655b8d582ea5b46ed9f",
- "value": "Downloadingโdata:โ100%"
- }
- },
- "98c58f23f4d549518832cb2d18f796e8": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_09b76013aa9e45efb6deb23a7a0d0925",
- "IPY_MODEL_39b29a75374b45c0a22506010be2b84e",
- "IPY_MODEL_78e5400bff924a92a4cc61c4ff18b182"
- ],
- "layout": "IPY_MODEL_2a58d04b428c46f4b3dbadd3bc6cd529"
- }
- },
- "99fdbb0300c14c139d1937c646f0cfe7": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_7358cdad832342c983e31efb8754ab78",
- "placeholder": "โ",
- "style": "IPY_MODEL_e9adf418296e436fb48bb9f78885598b",
- "value": "โ51760/51760โ[00:01<00:00,โ38665.95โexamples/s]"
- }
- },
- "9f679ad3ec7f4fe8ad0510ffb57bc2ab": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_4ea63adfce694725bdba878aef709dd3",
- "placeholder": "โ",
- "style": "IPY_MODEL_74501720ac7e4dbb911a4a99b3633bc6",
- "value": "tokenizer.json:โ100%"
- }
- },
- "a0037bdccf254159becde630bee3d1db": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "a069d2ab23824f29aa320ac256e2cfe9": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "a0bf9160eb2647409b3200270914b90f": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "a41dc44766444a998bec2d777f249d23": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "a8464a4c711e4e00aafdfc919b60d07e": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_fb995c740590427b882572c81d4e848c",
- "placeholder": "โ",
- "style": "IPY_MODEL_201b59ccd9f845e197029b57e424aefc",
- "value": "โ172/172โ[00:00<00:00,โ12.0kB/s]"
- }
- },
- "a9f0cc51fc3d4d7b874c32dcf1c5bdf2": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "ad2be500fc164c0f86f33e914ef8e6a0": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "b0240cd9a4554b29ae11f8051984a1c6": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_edaf890370314a218f138015faa0b05d",
- "placeholder": "โ",
- "style": "IPY_MODEL_697f027529b54ee9956bae78a11e0611",
- "value": "Map:โ100%"
- }
- },
- "b0a370dc20654b279b9680692e34418e": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "b518dcee69074b87be73957cd810e7ed": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_d891f8d0b1fc462f8008d02bb2a15692",
- "placeholder": "โ",
- "style": "IPY_MODEL_cced8fd7e998472794f3f3e3018956a5",
- "value": "tokenizer_config.json:โ100%"
- }
- },
- "b8908fa0df3743ecb9d12983a739104f": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "b993eaec6b224440bf80c0958c6fb536": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "b9b313fd861948f5aba25b24b1518d30": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "ba90fdb8822d47dab7ba203bee297f37": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HTMLModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_0f8b6bfe16894500838793f2491d403f",
- "placeholder": "โ",
- "style": "IPY_MODEL_bb19f6c747754682a514373a3a0535ba",
- "value": "Downloadingโreadme:โ100%"
- }
- },
- "bb19f6c747754682a514373a3a0535ba": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "bc883d4cf13e4f8b8a4fe5f410cb6efd": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_e9159e03e61f4f56978ece9c3bca49b2",
- "max": 51760,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_810ff6c0e17d4fa09a30fef27eacff90",
- "value": 51760
- }
- },
- "c161d94df0f04feba9542237e0856c22": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "c22f71b1f85843209d7e5321506b9cb9": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_1f44c9ce1adf470cbb19784493ed209f",
- "IPY_MODEL_f1addc4479d849879e743cf9089e6540",
- "IPY_MODEL_8b3505352a5a42bf910428c40ce40465"
- ],
- "layout": "IPY_MODEL_4c4c88d4c701450692fa0f6b0c5764b0"
- }
- },
- "c4f2b06a82fd4987b8b659524a7b503b": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "cca8113c54c0495daedce1327bf9c68b": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_e02f9b7849c64531835eb77b860d1c93",
- "max": 464,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_56aee4853b7740e6a977254f5d1fa66d",
- "value": 464
- }
- },
- "cced8fd7e998472794f3f3e3018956a5": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "cf245afeb1c04f29a24d291608c3d157": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_b518dcee69074b87be73957cd810e7ed",
- "IPY_MODEL_e29104486d594b2992d7285e0ef77371",
- "IPY_MODEL_6578fd7acdb54c4c93528ea431fd0144"
- ],
- "layout": "IPY_MODEL_d35db8148a354c56aaac56dbae22536f"
- }
- },
- "cfe8cae0e22b495bafa221a63d13b283": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "cfeb365ddf7548d58b2557f22737fcf5": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "d1b47d39450d4019ae85c9b2f943eeaf": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_4dcf6ff672d24983a1877a8431709aa9",
- "IPY_MODEL_7975adbc2ec5489ea7fa0167e620d85c",
- "IPY_MODEL_71ce208e20d6483abb9ed923510c86d7"
- ],
- "layout": "IPY_MODEL_cfe8cae0e22b495bafa221a63d13b283"
- }
- },
- "d35db8148a354c56aaac56dbae22536f": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "d69dc491b3ab44d7852b21873ed7bb7f": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "d891f8d0b1fc462f8008d02bb2a15692": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "d8e5318cead340c4adbeaccc05d39225": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "ProgressStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "daf4cd890b35422683d22fd30bc71e83": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_b0240cd9a4554b29ae11f8051984a1c6",
- "IPY_MODEL_bc883d4cf13e4f8b8a4fe5f410cb6efd",
- "IPY_MODEL_99fdbb0300c14c139d1937c646f0cfe7"
- ],
- "layout": "IPY_MODEL_c161d94df0f04feba9542237e0856c22"
- }
- },
- "db19fc8d37db4e45a5790a876836d8c4": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "de868e26e7154f62aa86223a539ad421": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "dea41c5260884aa6879b5e1d1697b14f": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "e02f9b7849c64531835eb77b860d1c93": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "e29104486d594b2992d7285e0ef77371": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_a9f0cc51fc3d4d7b874c32dcf1c5bdf2",
- "max": 50641,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_2f6c70dd266c4816bfad3fd3d192929a",
- "value": 50641
- }
- },
- "e36a3f9eff0e4cf68834d66b0213ae96": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "e9159e03e61f4f56978ece9c3bca49b2": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "e9adf418296e436fb48bb9f78885598b": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "edaf890370314a218f138015faa0b05d": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "f1addc4479d849879e743cf9089e6540": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_43dec2ede91341f5af60eb522e18e984",
- "max": 5702746405,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_d8e5318cead340c4adbeaccc05d39225",
- "value": 5702746405
- }
- },
- "f2df530d22c74977b249dd9fb5f4829b": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "FloatProgressModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_21db8a77b00d4a4e82fdfa608657531f",
- "max": 9085698,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_6dbbedeca9314e66ae50e44ffa31a414",
- "value": 9085698
- }
- },
- "f401d53bf28e44eb906bce6c05412662": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "DescriptionStyleModel",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "fb995c740590427b882572c81d4e848c": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "fce7a61c25ec4390af43d92b7c473a45": {
- "model_module": "@jupyter-widgets/controls",
- "model_module_version": "1.5.0",
- "model_name": "HBoxModel",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_30307300bc4e4baf96560e30969a82b6",
- "IPY_MODEL_8fc142b628fb40568730234de1cafde2",
- "IPY_MODEL_a8464a4c711e4e00aafdfc919b60d07e"
- ],
- "layout": "IPY_MODEL_5f40db8173dd4d76b6ef5ed6d9ec8b6e"
- }
- },
- "fdb1941405ed4e4aa06019933892deb3": {
- "model_module": "@jupyter-widgets/base",
- "model_module_version": "1.2.0",
- "model_name": "LayoutModel",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- }
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}