diff --git "a/competition/08c_InterLM_finetuning_NV4080_p2.ipynb" "b/competition/08c_InterLM_finetuning_NV4080_p2.ipynb" --- "a/competition/08c_InterLM_finetuning_NV4080_p2.ipynb" +++ "b/competition/08c_InterLM_finetuning_NV4080_p2.ipynb" @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -15,7 +15,15 @@ "title": "" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python 3.11.9\n" + ] + } + ], "source": [ "if 'dbutils' in locals():\n", " dbutils.library.restartPython()\n", @@ -25,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -46,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -59,7 +67,15 @@ "title": "" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "workding dir: /home/inflaton/code/projects/courses/logical-reasoning\n" + ] + } + ], "source": [ "import os\n", "import sys\n", @@ -73,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -86,7 +102,18 @@ "title": "" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "need_to_setup_env = False\n", "need_to_setup_env" @@ -94,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -119,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -132,7 +159,15 @@ "title": "" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "workding dir: /home/inflaton/code/projects/courses/logical-reasoning\n" + ] + } + ], "source": [ "os.chdir(workding_dir)\n", "sys.path.append(workding_dir)\n", @@ -141,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -154,7 +189,25 @@ "title": "" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading env vars from: /home/inflaton/code/projects/courses/logical-reasoning/.env\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from dotenv import find_dotenv, load_dotenv\n", "\n", @@ -168,7 +221,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -181,7 +234,15 @@ "title": "" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "internlm/internlm2_5-7b-chat-1m None True datasets/mgtv results/mgtv-results_nv4080_p2.csv\n" + ] + } + ], "source": [ "import os\n", "\n", @@ -197,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -210,14 
+271,41 @@ "title": "" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Thu Jul 11 13:53:26 2024 \n", + "+---------------------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 545.23.07 Driver Version: 546.12 CUDA Version: 12.3 |\n", + "|-----------------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|=========================================+======================+======================|\n", + "| 0 NVIDIA GeForce RTX 4080 ... On | 00000000:01:00.0 Off | N/A |\n", + "| N/A 52C P8 3W / 150W | 0MiB / 12282MiB | 0% Default |\n", + "| | | N/A |\n", + "+-----------------------------------------+----------------------+----------------------+\n", + " \n", + "+---------------------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=======================================================================================|\n", + "| No running processes found |\n", + "+---------------------------------------------------------------------------------------+\n" + ] + } + ], "source": [ "!nvidia-smi" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -230,7 +318,18 @@ "title": "" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python 3.11.9\n", + "\u001b[33mWARNING: Package(s) not found: flash-attn\u001b[0m\u001b[33m\n", + "\u001b[0mCPU times: user 9.31 ms, sys: 0 ns, total: 9.31 ms\n", + "Wall time: 553 ms\n" + ] + } + ], "source": [ "%%time\n", "!python --version\n", @@ -239,9 +338,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading /home/inflaton/code/projects/courses/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n" + ] + } + ], "source": [ "import os\n", "import pandas as pd\n", @@ -274,9 +381,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading existing data from: llama-factory/data/alpaca_mgtv_p2.json\n", + "--------------------------------------------------\n", + "instruction: 你是一个情景猜谜游戏的主持人。游戏规则如下:\n", + "\n", + "1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n", + "2. 主持人知道谜底,谜底是谜面的答案。\n", + "3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n", + "4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n", + " - 若谜面和谜底能找到问题的答案,回答:是或者不是\n", + " - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n", + " - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n", + " - 若参与者提问基本还原了谜底真相,回答:回答正确\n", + "5. 
回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n", + "\n", + "请严格按照这些规则回答参与者提出的问题。\n", + "\n", + "**谜面:** 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n", + "\n", + "**谜底:** 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n", + "\n", + "**参与者提出的问题:** 偷的人信神吗\n", + "\n", + "--------------------------------------------------\n", + "input: \n", + "--------------------------------------------------\n", + "output: 不是\n" + ] + } + ], "source": [ "df_alpaca = load_data()\n", "print_row_details(df_alpaca)" @@ -284,9 +424,861 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current Directory:\n", + "/home/inflaton/code/projects/courses/logical-reasoning/llama-factory\n", + "config/internlm2_5_7b_lora_sft_4bit_p2.yaml:\n", + " {\n", + " \"model_name_or_path\": \"internlm/internlm2_5-7b-chat-1m\",\n", + " \"stage\": \"sft\",\n", + " \"do_train\": true,\n", + " \"finetuning_type\": \"lora\",\n", + " \"lora_target\": \"all\",\n", + " \"quantization_bit\": 4,\n", + " \"loraplus_lr_ratio\": 16.0,\n", + " \"upcast_layernorm\": true,\n", + " \"dataset\": \"alpaca_mgtv_p2\",\n", + " \"template\": \"chatml\",\n", + " \"cutoff_len\": 1024,\n", + " \"max_samples\": 5000,\n", + " \"overwrite_cache\": true,\n", + " \"preprocessing_num_workers\": 16,\n", + " \"output_dir\": \"saves/internlm2_5_7b/lora/sft_p2\",\n", + " \"logging_steps\": 100,\n", + " \"save_steps\": 562,\n", + " \"plot_loss\": true,\n", + " \"overwrite_output_dir\": true,\n", + " \"per_device_train_batch_size\": 1,\n", + " \"gradient_accumulation_steps\": 8,\n", + " \"learning_rate\": 0.0001,\n", + " \"num_train_epochs\": 6.0,\n", + " \"lr_scheduler_type\": \"cosine\",\n", + " \"warmup_ratio\": 0.1,\n", + " \"bf16\": true,\n", + " \"ddp_timeout\": 180000000,\n", + " \"val_size\": 0.1,\n", + " \"per_device_eval_batch_size\": 1,\n", + " \"eval_strategy\": \"steps\",\n", + " \"eval_steps\": 562,\n", + " \"report_to\": \"none\",\n", + " \"run_name\": \"internlm2_5_7b\"\n", + "}\n", + "07/11/2024 13:53:37 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n", + "[INFO|tokenization_utils_base.py:2161] 2024-07-11 13:53:43,438 >> loading file ./tokenizer.model from cache at /home/inflaton/.cache/huggingface/hub/models--internlm--internlm2_5-7b-chat-1m/snapshots/8d1a709a04d71440ef3df6ebbe204672f411c8b6/./tokenizer.model\n", + "[INFO|tokenization_utils_base.py:2161] 2024-07-11 13:53:43,438 >> loading file added_tokens.json from cache at None\n", + "[INFO|tokenization_utils_base.py:2161] 2024-07-11 13:53:43,438 >> loading file special_tokens_map.json from cache at /home/inflaton/.cache/huggingface/hub/models--internlm--internlm2_5-7b-chat-1m/snapshots/8d1a709a04d71440ef3df6ebbe204672f411c8b6/special_tokens_map.json\n", + "[INFO|tokenization_utils_base.py:2161] 2024-07-11 13:53:43,438 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--internlm--internlm2_5-7b-chat-1m/snapshots/8d1a709a04d71440ef3df6ebbe204672f411c8b6/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2161] 2024-07-11 13:53:43,438 >> loading file tokenizer.json from cache at None\n", + "07/11/2024 13:53:44 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n", + 
"07/11/2024 13:53:44 - INFO - llamafactory.data.template - Add <|im_start|> to stop words.\n", + "07/11/2024 13:53:44 - INFO - llamafactory.data.loader - Loading dataset alpaca_mgtv_p2.json...\n", + "Generating train split: 25000 examples [00:00, 43158.06 examples/s]\n", + "Converting format of dataset (num_proc=16): 100%|█| 5000/5000 [00:00<00:00, 1422\n", + "Running tokenizer on dataset (num_proc=16): 100%|█| 5000/5000 [00:01<00:00, 3141\n", + "input_ids:\n", + "[92543, 1008, 364, 60403, 68625, 77794, 62591, 63352, 68309, 69323, 60687, 60364, 60355, 68309, 69776, 68411, 60387, 402, 312, 281, 262, 69102, 60497, 60382, 89428, 63352, 60388, 60353, 63352, 60388, 60382, 69401, 68252, 87114, 70436, 68865, 82168, 60355, 364, 314, 281, 262, 74243, 68290, 63352, 60930, 60353, 63352, 60930, 60357, 63352, 68421, 69059, 60355, 364, 308, 281, 262, 69102, 60497, 68251, 73477, 68574, 74004, 60550, 68287, 89214, 61683, 88840, 73687, 60355, 364, 319, 281, 262, 68390, 68772, 68287, 60353, 74243, 60530, 68420, 74740, 68855, 68544, 72719, 68423, 68538, 60387, 60357, 60359, 68278, 60359, 82568, 60359, 68855, 69077, 60359, 60593, 60408, 69583, 60355, 60684, 68855, 60354, 69844, 68559, 68411, 60387, 364, 393, 285, 262, 61369, 63352, 81953, 63352, 60930, 91085, 70670, 69059, 60353, 68855, 60387, 60357, 68319, 68278, 364, 393, 285, 262, 61369, 63352, 81953, 63352, 60930, 68336, 68376, 68319, 80078, 60876, 61015, 60389, 70670, 69059, 60353, 68855, 60387, 82568, 364, 393, 285, 262, 61369, 69102, 60497, 73912, 79865, 74004, 60550, 68287, 68319, 68287, 70436, 68865, 60353, 68855, 60387, 60593, 60408, 69583, 364, 393, 285, 262, 61369, 69102, 60497, 73912, 68406, 71940, 60362, 63352, 60930, 73687, 60353, 68855, 60387, 68855, 69077, 364, 317, 281, 262, 68855, 60366, 68336, 68535, 68574, 69344, 68347, 60353, 71452, 81256, 68423, 68322, 78818, 60666, 60355, 69192, 60353, 73263, 60581, 60419, 68278, 60420, 81256, 60397, 60419, 60358, 60420, 60355, 402, 60836, 86910, 68374, 69776, 68855, 69102, 60497, 74743, 68287, 60355, 402, 465, 63352, 60388, 334, 465, 262, 60361, 63840, 60396, 78165, 60353, 68935, 79406, 70952, 60387, 69731, 71150, 88982, 82620, 60353, 71150, 61329, 60425, 60649, 68935, 69410, 71150, 60382, 60358, 62273, 60458, 61217, 60353, 71479, 60400, 72593, 69380, 79594, 90209, 60355, 60836, 75326, 71150, 82066, 79202, 68540, 60355, 402, 465, 63352, 60930, 334, 465, 262, 73687, 69607, 60510, 70226, 60372, 62650, 60354, 61044, 61066, 69045, 60355, 71389, 61044, 61066, 89463, 60353, 61002, 60510, 70226, 73027, 70134, 60544, 61422, 60355, 68310, 74907, 60361, 71150, 88982, 82620, 68980, 60355, 69104, 60353, 71062, 61976, 60364, 60353, 70134, 60361, 72325, 60463, 68294, 60612, 70623, 60366, 60877, 60668, 60355, 74726, 60354, 61044, 61066, 68394, 70367, 60447, 69126, 70134, 60353, 69731, 68549, 60530, 69410, 71150, 61882, 60825, 60353, 70395, 70134, 60354, 62296, 60463, 60353, 72069, 86407, 68304, 63024, 60880, 60355, 68597, 68891, 73936, 60362, 69372, 60353, 71093, 72276, 60425, 68252, 82569, 70952, 60355, 402, 465, 69102, 60497, 74743, 68287, 334, 465, 262, 61882, 68279, 60548, 60780, 61076, 364, 92542, 364, 92543, 525, 11353, 364, 68278, 92542]\n", + "inputs:\n", + "<|im_start|>user\n", + "你是一个情景猜谜游戏的主持人。游戏规则如下:\n", + "\n", + "1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n", + "2. 主持人知道谜底,谜底是谜面的答案。\n", + "3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n", + "4. 
对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n", + " - 若谜面和谜底能找到问题的答案,回答:是或者不是\n", + " - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n", + " - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n", + " - 若参与者提问基本还原了谜底真相,回答:回答正确\n", + "5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n", + "\n", + "请严格按照这些规则回答参与者提出的问题。\n", + "\n", + "**谜面:** 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n", + "\n", + "**谜底:** 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n", + "\n", + "**参与者提出的问题:** 偷的人信神吗\n", + "<|im_end|>\n", + "<|im_start|>assistant\n", + "不是<|im_end|>\n", + "label_ids:\n", + "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 68278, 92542]\n", + "labels:\n", + "不是<|im_end|>\n", + "[INFO|configuration_utils.py:733] 2024-07-11 13:53:56,968 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--internlm--internlm2_5-7b-chat-1m/snapshots/8d1a709a04d71440ef3df6ebbe204672f411c8b6/config.json\n", + "[INFO|configuration_utils.py:733] 2024-07-11 13:54:00,093 >> loading configuration file config.json from cache at 
/home/inflaton/.cache/huggingface/hub/models--internlm--internlm2_5-7b-chat-1m/snapshots/8d1a709a04d71440ef3df6ebbe204672f411c8b6/config.json\n", + "[INFO|configuration_utils.py:800] 2024-07-11 13:54:00,094 >> Model config InternLM2Config {\n", + " \"_name_or_path\": \"internlm/internlm2_5-7b-chat-1m\",\n", + " \"architectures\": [\n", + " \"InternLM2ForCausalLM\"\n", + " ],\n", + " \"attn_implementation\": \"eager\",\n", + " \"auto_map\": {\n", + " \"AutoConfig\": \"internlm/internlm2_5-7b-chat-1m--configuration_internlm2.InternLM2Config\",\n", + " \"AutoModel\": \"internlm/internlm2_5-7b-chat-1m--modeling_internlm2.InternLM2ForCausalLM\",\n", + " \"AutoModelForCausalLM\": \"internlm/internlm2_5-7b-chat-1m--modeling_internlm2.InternLM2ForCausalLM\"\n", + " },\n", + " \"bias\": false,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 262144,\n", + " \"model_type\": \"internlm2\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"pad_token_id\": 2,\n", + " \"pretraining_tp\": 1,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_scaling\": {\n", + " \"factor\": 2.5,\n", + " \"type\": \"dynamic\"\n", + " },\n", + " \"rope_theta\": 50000000,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.42.3\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 92544\n", + "}\n", + "\n", + "07/11/2024 13:54:00 - INFO - llamafactory.model.model_utils.quantization - Quantizing model to 4 bit with bitsandbytes.\n", + "[INFO|modeling_utils.py:3556] 2024-07-11 13:54:02,069 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--internlm--internlm2_5-7b-chat-1m/snapshots/8d1a709a04d71440ef3df6ebbe204672f411c8b6/model.safetensors.index.json\n", + "[INFO|modeling_utils.py:1531] 2024-07-11 13:54:02,217 >> Instantiating InternLM2ForCausalLM model under default dtype torch.bfloat16.\n", + "[INFO|configuration_utils.py:1000] 2024-07-11 13:54:02,217 >> Generate config GenerationConfig {\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"pad_token_id\": 2\n", + "}\n", + "\n", + "Loading checkpoint shards: 100%|██████████████████| 8/8 [11:33<00:00, 86.64s/it]\n", + "[INFO|modeling_utils.py:4364] 2024-07-11 14:05:35,470 >> All model checkpoint weights were used when initializing InternLM2ForCausalLM.\n", + "\n", + "[INFO|modeling_utils.py:4372] 2024-07-11 14:05:35,470 >> All the weights of InternLM2ForCausalLM were initialized from the model checkpoint at internlm/internlm2_5-7b-chat-1m.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use InternLM2ForCausalLM for predictions without further training.\n", + "[INFO|configuration_utils.py:955] 2024-07-11 14:05:36,035 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--internlm--internlm2_5-7b-chat-1m/snapshots/8d1a709a04d71440ef3df6ebbe204672f411c8b6/generation_config.json\n", + "[INFO|configuration_utils.py:1000] 2024-07-11 14:05:36,035 >> Generate config GenerationConfig {\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": [\n", + " 2,\n", + " 92542\n", + " ],\n", + " \"pad_token_id\": 2\n", + "}\n", + "\n", + "07/11/2024 14:05:36 - INFO - 
llamafactory.model.model_utils.checkpointing - Upcasting layernorm weights in float32.\n", + "07/11/2024 14:05:36 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n", + "07/11/2024 14:05:36 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation.\n", + "07/11/2024 14:05:36 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n", + "07/11/2024 14:05:36 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n", + "07/11/2024 14:05:36 - INFO - llamafactory.model.model_utils.misc - Found linear modules: w1,wqkv,w3,wo,w2\n", + "07/11/2024 14:05:36 - INFO - llamafactory.model.loader - trainable params: 18,874,368 || all params: 7,756,582,912 || trainable%: 0.2433\n", + "[INFO|trainer.py:642] 2024-07-11 14:05:36,681 >> Using auto half precision backend\n", + "07/11/2024 14:05:36 - INFO - llamafactory.train.trainer_utils - Using LoRA+ optimizer with loraplus lr ratio 16.00.\n", + "[INFO|trainer.py:2128] 2024-07-11 14:05:36,907 >> ***** Running training *****\n", + "[INFO|trainer.py:2129] 2024-07-11 14:05:36,907 >> Num examples = 4,500\n", + "[INFO|trainer.py:2130] 2024-07-11 14:05:36,907 >> Num Epochs = 6\n", + "[INFO|trainer.py:2131] 2024-07-11 14:05:36,907 >> Instantaneous batch size per device = 1\n", + "[INFO|trainer.py:2134] 2024-07-11 14:05:36,907 >> Total train batch size (w. parallel, distributed & accumulation) = 8\n", + "[INFO|trainer.py:2135] 2024-07-11 14:05:36,907 >> Gradient Accumulation steps = 8\n", + "[INFO|trainer.py:2136] 2024-07-11 14:05:36,907 >> Total optimization steps = 3,372\n", + "[INFO|trainer.py:2137] 2024-07-11 14:05:36,909 >> Number of trainable parameters = 18,874,368\n", + "{'loss': 0.4566, 'grad_norm': 2.716310501098633, 'learning_rate': 2.958579881656805e-05, 'epoch': 0.18}\n", + "{'loss': 0.3644, 'grad_norm': 4.124796390533447, 'learning_rate': 5.91715976331361e-05, 'epoch': 0.36}\n", + "{'loss': 0.3789, 'grad_norm': 2.9282803535461426, 'learning_rate': 8.875739644970414e-05, 'epoch': 0.53}\n", + "{'loss': 0.37, 'grad_norm': 2.6953532695770264, 'learning_rate': 9.989699867437137e-05, 'epoch': 0.71}\n", + "{'loss': 0.3485, 'grad_norm': 6.001204013824463, 'learning_rate': 9.92981892269398e-05, 'epoch': 0.89}\n", + " 17%|█████▊ | 562/3372 [2:40:33<13:23:16, 17.15s/it][INFO|trainer.py:3788] 2024-07-11 16:46:10,114 >> \n", + "***** Running Evaluation *****\n", + "[INFO|trainer.py:3790] 2024-07-11 16:46:10,115 >> Num examples = 500\n", + "[INFO|trainer.py:3793] 2024-07-11 16:46:10,115 >> Batch size = 1\n", + "\n", + " 0%| | 0/500 [00:00<?, ?it/s]\n", + ">> Saving model checkpoint to saves/internlm2_5_7b/lora/sft_p2/checkpoint-562\n", + "[INFO|configuration_utils.py:733] 2024-07-11 16:52:57,877 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--internlm--internlm2_5-7b-chat-1m/snapshots/8d1a709a04d71440ef3df6ebbe204672f411c8b6/config.json\n", + "[INFO|configuration_utils.py:800] 2024-07-11 16:52:57,878 >> Model config InternLM2Config {\n", + " \"architectures\": [\n", + " \"InternLM2ForCausalLM\"\n", + " ],\n", + " \"attn_implementation\": \"eager\",\n", + " \"auto_map\": {\n", + " \"AutoConfig\": \"internlm/internlm2_5-7b-chat-1m--configuration_internlm2.InternLM2Config\",\n", + " \"AutoModel\": \"internlm/internlm2_5-7b-chat-1m--modeling_internlm2.InternLM2ForCausalLM\",\n", + " \"AutoModelForCausalLM\": \"internlm/internlm2_5-7b-chat-1m--modeling_internlm2.InternLM2ForCausalLM\"\n", + " },\n", + " \"bias\": false,\n", + " 
\"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 262144,\n", + " \"model_type\": \"internlm2\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"pad_token_id\": 2,\n", + " \"pretraining_tp\": 1,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_scaling\": {\n", + " \"factor\": 2.5,\n", + " \"type\": \"dynamic\"\n", + " },\n", + " \"rope_theta\": 50000000,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.42.3\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 92544\n", + "}\n", + "\n", + "[INFO|tokenization_utils_base.py:2574] 2024-07-11 16:52:58,318 >> tokenizer config file saved in saves/internlm2_5_7b/lora/sft_p2/checkpoint-562/tokenizer_config.json\n", + "[INFO|tokenization_utils_base.py:2583] 2024-07-11 16:52:58,318 >> Special tokens file saved in saves/internlm2_5_7b/lora/sft_p2/checkpoint-562/special_tokens_map.json\n", + "{'loss': 0.3563, 'grad_norm': 5.8297953605651855, 'learning_rate': 9.817128546774103e-05, 'epoch': 1.07}\n", + " 18%|██████▍ | 619/3372 [3:03:42<13:13:23, 17.29s/it]^C\n", + "Traceback (most recent call last):\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/bin/llamafactory-cli\", line 8, in \n", + " sys.exit(main())\n", + " ^^^^^^\n", + " File \"/home/inflaton/code/projects/courses/LLaMA-Factory/src/llamafactory/cli.py\", line 111, in main\n", + " run_exp()\n", + " File \"/home/inflaton/code/projects/courses/LLaMA-Factory/src/llamafactory/train/tuner.py\", line 50, in run_exp\n", + " run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)\n", + " File \"/home/inflaton/code/projects/courses/LLaMA-Factory/src/llamafactory/train/sft/workflow.py\", line 88, in run_sft\n", + " train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/trainer.py\", line 1932, in train\n", + " return inner_training_loop(\n", + " ^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/trainer.py\", line 2268, in _inner_training_loop\n", + " tr_loss_step = self.training_step(model, inputs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/trainer.py\", line 3307, in training_step\n", + " loss = self.compute_loss(model, inputs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/transformers/trainer.py\", line 3338, in compute_loss\n", + " outputs = model(**inputs)\n", + " ^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1532, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1541, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/utils/operations.py\", line 822, in forward\n", + " return model_forward(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/utils/operations.py\", line 810, in __call__\n", + " return convert_to_fp32(self.model_forward(*args, **kwargs))\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/amp/autocast_mode.py\", line 16, in decorate_autocast\n", + " return func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/peft/peft_model.py\", line 1430, in forward\n", + " return self.base_model(\n", + " ^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1532, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1541, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/peft/tuners/tuners_utils.py\", line 179, in forward\n", + " return self.model.forward(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/hooks.py\", line 166, in new_forward\n", + " output = module._old_forward(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/.cache/huggingface/modules/transformers_modules/internlm/internlm2_5-7b-chat-1m/8d1a709a04d71440ef3df6ebbe204672f411c8b6/modeling_internlm2.py\", line 1204, in forward\n", + " outputs = self.model(\n", + " ^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1532, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1541, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/hooks.py\", line 166, in new_forward\n", + " output = module._old_forward(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/.cache/huggingface/modules/transformers_modules/internlm/internlm2_5-7b-chat-1m/8d1a709a04d71440ef3df6ebbe204672f411c8b6/modeling_internlm2.py\", line 993, in forward\n", + " layer_outputs = self._gradient_checkpointing_func(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/code/projects/courses/LLaMA-Factory/src/llamafactory/model/model_utils/checkpointing.py\", line 65, in custom_gradient_checkpointing_func\n", + " return gradient_checkpointing_func(func, *args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/_compile.py\", line 24, in inner\n", + " return torch._dynamo.disable(fn, recursive)(*args, 
**kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 451, in _fn\n", + " return fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/_dynamo/external_utils.py\", line 36, in inner\n", + " return fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/utils/checkpoint.py\", line 487, in checkpoint\n", + " return CheckpointFunction.apply(function, preserve, *args)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/autograd/function.py\", line 598, in apply\n", + " return super().apply(*args, **kwargs) # type: ignore[misc]\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/utils/checkpoint.py\", line 262, in forward\n", + " outputs = run_function(*args)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1532, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1541, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/hooks.py\", line 166, in new_forward\n", + " output = module._old_forward(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/.cache/huggingface/modules/transformers_modules/internlm/internlm2_5-7b-chat-1m/8d1a709a04d71440ef3df6ebbe204672f411c8b6/modeling_internlm2.py\", line 752, in forward\n", + " hidden_states = self.feed_forward(hidden_states)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1532, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1541, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/accelerate/hooks.py\", line 166, in new_forward\n", + " output = module._old_forward(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/.cache/huggingface/modules/transformers_modules/internlm/internlm2_5-7b-chat-1m/8d1a709a04d71440ef3df6ebbe204672f411c8b6/modeling_internlm2.py\", line 206, in forward\n", + " down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x))\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1532, in _wrapped_call_impl\n", + " return self._call_impl(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File 
\"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1541, in _call_impl\n", + " return forward_call(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/peft/tuners/lora/bnb.py\", line 460, in forward\n", + " for active_adapter in self.active_adapters:\n", + " ^^^^^^^^^^^^^^^^^^^^\n", + " File \"/home/inflaton/miniconda3/envs/llama-factory/lib/python3.11/site-packages/peft/tuners/tuners_utils.py\", line 528, in active_adapters\n", + " @property\n", + "\n", + "KeyboardInterrupt\n", + "CPU times: user 4min 41s, sys: 1min 43s, total: 6min 24s\n", + "Wall time: 3h 16min\n" + ] + } + ], "source": [ "%%time\n", "\n",