{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "0ea8b46b-839b-445b-8043-ccdf4e920ace", "showTitle": false, "title": "" }, "id": "YLH80COBzi_F" }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "63B5exAuzq4M" }, "outputs": [], "source": [ "from pathlib import Path\n", "\n", "if \"workding_dir\" not in locals():\n", " try:\n", " from google.colab import drive\n", " drive.mount('/content/drive')\n", " workding_dir = \"/content/drive/MyDrive/logical-reasoning/\"\n", " except ModuleNotFoundError:\n", " workding_dir = str(Path.cwd().parent)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "executionInfo": { "elapsed": 368, "status": "ok", "timestamp": 1719461634865, "user": { "displayName": "Donghao Huang", "userId": "00463591218503521679" }, "user_tz": -480 }, "id": "zFulf0bg0H-9", "outputId": "debdd535-c828-40b9-efc0-8a180e5830dd" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "workding dir: /home/inflaton/code/logical-reasoning\n" ] } ], "source": [ "import os\n", "import sys\n", "\n", "os.chdir(workding_dir)\n", "sys.path.append(workding_dir)\n", "print(\"workding dir:\", workding_dir)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, "inputWidgets": {}, "nuid": "9f67ec60-2f24-411c-84eb-0dd664b44775", "showTitle": false, "title": "" }, "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 589, "status": "ok", "timestamp": 1719462011879, "user": { "displayName": "Donghao Huang", "userId": "00463591218503521679" }, "user_tz": -480 }, "id": "DIUiweYYzi_I", "outputId": "e16e9247-9077-4b0c-f8ea-17059f05a1c4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Current Directory:\n", "/home/inflaton/code/logical-reasoning\n", "Sat Sep 21 15:45:11 2024 \n", "+-----------------------------------------------------------------------------------------+\n", "| NVIDIA-SMI 560.35.02 Driver Version: 560.94 CUDA Version: 12.6 |\n", "|-----------------------------------------+------------------------+----------------------+\n", "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. 
|\n", "|=========================================+========================+======================|\n", "| 0 NVIDIA GeForce RTX 4090 On | 00000000:01:00.0 On | Off |\n", "| 37% 54C P8 22W / 450W | 535MiB / 24564MiB | 4% Default |\n", "| | | N/A |\n", "+-----------------------------------------+------------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=========================================================================================|\n", "| 0 N/A N/A 25 G /Xwayland N/A |\n", "+-----------------------------------------------------------------------------------------+\n", "Linux Gen-AI 5.15.133.1-microsoft-standard-WSL2 #1 SMP Thu Oct 5 21:02:42 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux\n", "PRETTY_NAME=\"Ubuntu 22.04.2 LTS\"\n", "NAME=\"Ubuntu\"\n", "VERSION_ID=\"22.04\"\n", "VERSION=\"22.04.2 LTS (Jammy Jellyfish)\"\n", "VERSION_CODENAME=jammy\n", "ID=ubuntu\n", "ID_LIKE=debian\n", "HOME_URL=\"https://www.ubuntu.com/\"\n", "SUPPORT_URL=\"https://help.ubuntu.com/\"\n", "BUG_REPORT_URL=\"https://bugs.launchpad.net/ubuntu/\"\n", "PRIVACY_POLICY_URL=\"https://www.ubuntu.com/legal/terms-and-policies/privacy-policy\"\n", "UBUNTU_CODENAME=jammy\n", "Architecture: x86_64\n", " CPU op-mode(s): 32-bit, 64-bit\n", " Address sizes: 39 bits physical, 48 bits virtual\n", " Byte Order: Little Endian\n", "CPU(s): 32\n", " On-line CPU(s) list: 0-31\n", "Vendor ID: GenuineIntel\n", " Model name: 13th Gen Intel(R) Core(TM) i9-13900KF\n", " CPU family: 6\n", " Model: 183\n", " Thread(s) per core: 2\n", " Core(s) per socket: 16\n", " Socket(s): 1\n", " Stepping: 1\n", " BogoMIPS: 5990.39\n", " Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mc\n", " a cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscal\n", " l nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopo\n", " logy tsc_reliable nonstop_tsc cpuid pni pclmulqdq vmx s\n", " sse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt tsc_dea\n", " dline_timer aes xsave avx f16c rdrand hypervisor lahf_l\n", " m abm 3dnowprefetch ssbd ibrs ibpb stibp ibrs_enhanced \n", " tpr_shadow vnmi ept vpid ept_ad fsgsbase tsc_adjust bmi\n", " 1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushop\n", " t clwb sha_ni xsaveopt xsavec xgetbv1 xsaves avx_vnni u\n", " mip waitpkg gfni vaes vpclmulqdq rdpid movdiri movdir64\n", " b fsrm md_clear serialize flush_l1d arch_capabilities\n", "Virtualization features: \n", " Virtualization: VT-x\n", " Hypervisor vendor: Microsoft\n", " Virtualization type: full\n", "Caches (sum of all): \n", " L1d: 768 KiB (16 instances)\n", " L1i: 512 KiB (16 instances)\n", " L2: 32 MiB (16 instances)\n", " L3: 36 MiB (1 instance)\n", "Vulnerabilities: \n", " Gather data sampling: Not affected\n", " Itlb multihit: Not affected\n", " L1tf: Not affected\n", " Mds: Not affected\n", " Meltdown: Not affected\n", " Mmio stale data: Not affected\n", " Retbleed: Mitigation; Enhanced IBRS\n", " Spec rstack overflow: Not affected\n", " Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\n", " and seccomp\n", " Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer\n", " sanitization\n", " Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB fillin\n", " g, PBRSB-eIBRS SW sequence\n", " Srbds: Not affected\n", " Tsx async abort: Not affected\n", "MemTotal: 49330024 kB\n", "Current Directory:\n", 
"/home/inflaton/code/logical-reasoning/llama-factory\n", "loading env vars from: /home/inflaton/code/logical-reasoning/.env\n", "Adding /home/inflaton/code/logical-reasoning to sys.path\n", "loading /home/inflaton/code/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n", "Qwen Qwen2.5-3B-Instruct qwen config/mgtv_template.yaml ../datasets/mgtv\n", "Writing to config/models/Qwen2.5-3B-Instruct.yaml\n", "config/models/Qwen2.5-3B-Instruct.yaml:\n", " {\n", " \"model_name_or_path\": \"Qwen/Qwen2.5-3B-Instruct\",\n", " \"stage\": \"sft\",\n", " \"do_train\": true,\n", " \"finetuning_type\": \"lora\",\n", " \"lora_target\": \"all\",\n", " \"dataset\": \"alpaca_mgtv_p2\",\n", " \"template\": \"qwen\",\n", " \"cutoff_len\": 8192,\n", " \"max_samples\": 25000,\n", " \"overwrite_cache\": true,\n", " \"preprocessing_num_workers\": 16,\n", " \"output_dir\": \"saves/Qwen2.5-3B-Instruct\",\n", " \"logging_steps\": 5,\n", " \"save_steps\": 35,\n", " \"plot_loss\": true,\n", " \"per_device_train_batch_size\": 16,\n", " \"gradient_accumulation_steps\": 8,\n", " \"learning_rate\": 0.0001,\n", " \"num_train_epochs\": 2.0,\n", " \"lr_scheduler_type\": \"cosine\",\n", " \"warmup_ratio\": 0.1,\n", " \"bf16\": true,\n", " \"ddp_timeout\": 180000000,\n", " \"val_size\": 0.1,\n", " \"per_device_eval_batch_size\": 1,\n", " \"eval_strategy\": \"steps\",\n", " \"eval_steps\": 35,\n", " \"report_to\": \"none\",\n", " \"run_name\": \"Qwen2.5-3B-Instruct_lora_sft\"\n", "}\n", "09/21/2024 15:45:22 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n", "[INFO|configuration_utils.py:733] 2024-09-21 15:45:23,254 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 15:45:23,255 >> Model config Qwen2Config {\n", " \"_name_or_path\": \"Qwen/Qwen2.5-3B-Instruct\",\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 2048,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 11008,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 70,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 16,\n", " \"num_hidden_layers\": 36,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:23,514 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/vocab.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:23,514 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/merges.txt\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:23,514 >> loading file tokenizer.json from cache at 
/home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/tokenizer.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:23,514 >> loading file added_tokens.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:23,514 >> loading file special_tokens_map.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:23,514 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2533] 2024-09-21 15:45:23,600 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "[INFO|configuration_utils.py:733] 2024-09-21 15:45:24,723 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 15:45:24,724 >> Model config Qwen2Config {\n", " \"_name_or_path\": \"Qwen/Qwen2.5-3B-Instruct\",\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 2048,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 11008,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 70,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 16,\n", " \"num_hidden_layers\": 36,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:24,997 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/vocab.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:24,997 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/merges.txt\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:24,997 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/tokenizer.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:24,997 >> loading file added_tokens.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:24,997 >> loading file special_tokens_map.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:45:24,997 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2533] 2024-09-21 15:45:25,089 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "09/21/2024 15:45:25 - INFO - 
llamafactory.data.template - Replace eos token: <|im_end|>\n", "09/21/2024 15:45:25 - INFO - llamafactory.data.loader - Loading dataset alpaca_mgtv_p2.json...\n", "Converting format of dataset (num_proc=16): 100%|█| 25000/25000 [00:00<00:00, 11\n", "Running tokenizer on dataset (num_proc=16): 100%|█| 25000/25000 [00:01<00:00, 16\n", "training example:\n", "input_ids:\n", "[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 56568, 101909, 108024, 101497, 107969, 99329, 9370, 106040, 1773, 99329, 104190, 104506, 48443, 16, 13, 26853, 224, 57218, 28946, 36993, 101051, 46944, 107969, 27091, 3837, 107969, 27091, 36993, 53481, 46944, 100405, 99518, 104151, 101128, 9370, 57621, 8997, 17, 13, 89982, 68878, 17340, 99392, 107969, 99413, 3837, 107969, 99413, 20412, 107969, 27091, 111230, 8997, 18, 13, 26853, 224, 57218, 28946, 73670, 105396, 99885, 106386, 28330, 86119, 112469, 100246, 57621, 9370, 106538, 8997, 19, 13, 69162, 34204, 103991, 86119, 3837, 106040, 44063, 100345, 107591, 102104, 87752, 105220, 109487, 100653, 5122, 20412, 5373, 99520, 5373, 16530, 99335, 5373, 102104, 88991, 5373, 56007, 24339, 32100, 1773, 99200, 102104, 9370, 104317, 100142, 104506, 28311, 256, 481, 92498, 107969, 27091, 33108, 107969, 99413, 114562, 86119, 111230, 3837, 102104, 5122, 20412, 100631, 99520, 198, 256, 481, 92498, 107969, 27091, 33108, 107969, 99413, 53153, 101041, 100631, 108349, 83751, 63789, 20221, 86119, 111230, 3837, 102104, 5122, 16530, 99335, 198, 256, 481, 92498, 111842, 107666, 113479, 106386, 28330, 86119, 100631, 86119, 104151, 101128, 3837, 102104, 5122, 56007, 24339, 32100, 198, 256, 481, 92498, 111842, 107666, 99797, 108670, 34187, 107969, 99413, 106538, 3837, 102104, 5122, 102104, 88991, 198, 20, 13, 49602, 252, 99590, 15946, 53153, 42855, 99885, 102158, 27369, 3837, 105827, 65770, 99475, 109487, 101047, 110281, 18600, 1773, 77557, 3837, 108620, 99360, 2073, 99520, 854, 65770, 99475, 12857, 2073, 16530, 96332, 14880, 110439, 100001, 104190, 102104, 111842, 101080, 103936, 3407, 334, 107969, 27091, 66963, 73562, 109628, 45629, 105489, 3837, 104133, 111718, 106023, 5122, 101988, 115865, 110731, 9370, 105419, 3837, 115865, 99810, 69249, 59743, 104133, 104003, 115865, 36993, 16530, 101401, 68536, 99723, 3837, 115967, 104270, 102060, 110666, 112031, 1773, 14880, 109363, 115865, 110786, 101423, 104249, 3407, 334, 107969, 99413, 66963, 10236, 250, 253, 48921, 101221, 57218, 101961, 7948, 100894, 9370, 99288, 99818, 101063, 1773, 104269, 99288, 99818, 100774, 13343, 3837, 99798, 57218, 101961, 105664, 102373, 48921, 100271, 1773, 99650, 105616, 18493, 115865, 110731, 9370, 105419, 104388, 1773, 103968, 3837, 102606, 102115, 17340, 3837, 102373, 18493, 106340, 24562, 99774, 82224, 104424, 15946, 99372, 99244, 1773, 110597, 9370, 99288, 99818, 100012, 101416, 63109, 99242, 9370, 102373, 3837, 101988, 101938, 44063, 104003, 115865, 101329, 99314, 3837, 107974, 102373, 9370, 104575, 24562, 3837, 105699, 116418, 100005, 103000, 90663, 1773, 100147, 101070, 105443, 34187, 100097, 3837, 104989, 100833, 69249, 46944, 105190, 9370, 106023, 3407, 334, 111842, 101080, 103936, 66963, 4891, 223, 115, 100623, 21317, 99315, 101037, 198, 151645, 198, 151644, 77091, 198, 99520, 151645]\n", "inputs:\n", "<|im_start|>system\n", "You are a helpful assistant.<|im_end|>\n", "<|im_start|>user\n", "你是一个情景猜谜游戏的主持人。游戏规则如下:\n", "\n", "1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n", "2. 主持人知道谜底,谜底是谜面的答案。\n", "3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n", "4. 
对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n", " - 若谜面和谜底能找到问题的答案,回答:是或者不是\n", " - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n", " - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n", " - 若参与者提问基本还原了谜底真相,回答:回答正确\n", "5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n", "\n", "请严格按照这些规则回答参与者提出的问题。\n", "\n", "**谜面:** 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n", "\n", "**谜底:** 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n", "\n", "**参与者提出的问题:** 偷的人信神吗\n", "<|im_end|>\n", "<|im_start|>assistant\n", "不是<|im_end|>\n", "label_ids:\n", "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 99520, 151645]\n", "labels:\n", "不是<|im_end|>\n", "[INFO|configuration_utils.py:733] 2024-09-21 15:45:28,548 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 15:45:28,548 >> Model config Qwen2Config {\n", " \"_name_or_path\": 
\"Qwen/Qwen2.5-3B-Instruct\",\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 2048,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 11008,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 70,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 16,\n", " \"num_hidden_layers\": 36,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|modeling_utils.py:3634] 2024-09-21 15:45:28,559 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/model.safetensors.index.json\n", "[INFO|modeling_utils.py:1572] 2024-09-21 15:45:28,560 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n", "[INFO|configuration_utils.py:1038] 2024-09-21 15:45:28,560 >> Generate config GenerationConfig {\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645\n", "}\n", "\n", "Loading checkpoint shards: 100%|██████████████████| 2/2 [00:39<00:00, 19.96s/it]\n", "[INFO|modeling_utils.py:4463] 2024-09-21 15:46:08,667 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n", "\n", "[INFO|modeling_utils.py:4471] 2024-09-21 15:46:08,667 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at Qwen/Qwen2.5-3B-Instruct.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n", "[INFO|configuration_utils.py:993] 2024-09-21 15:46:08,922 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/82f42baa094a9600e39ccd80d34058aeeb3abbc1/generation_config.json\n", "[INFO|configuration_utils.py:1038] 2024-09-21 15:46:08,922 >> Generate config GenerationConfig {\n", " \"bos_token_id\": 151643,\n", " \"do_sample\": true,\n", " \"eos_token_id\": [\n", " 151645,\n", " 151643\n", " ],\n", " \"pad_token_id\": 151643,\n", " \"repetition_penalty\": 1.05,\n", " \"temperature\": 0.7,\n", " \"top_k\": 20,\n", " \"top_p\": 0.8\n", "}\n", "\n", "09/21/2024 15:46:09 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n", "09/21/2024 15:46:09 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.\n", "09/21/2024 15:46:09 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n", "09/21/2024 15:46:09 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n", "09/21/2024 15:46:09 - INFO - llamafactory.model.model_utils.misc - Found linear modules: v_proj,q_proj,k_proj,up_proj,gate_proj,down_proj,o_proj\n", "09/21/2024 15:46:09 - INFO - llamafactory.model.loader - trainable params: 14,966,784 || all params: 3,100,905,472 || trainable%: 0.4827\n", "[INFO|trainer.py:648] 2024-09-21 15:46:09,435 >> Using auto half precision backend\n", "[INFO|trainer.py:2134] 2024-09-21 15:46:09,741 >> ***** Running training *****\n", 
"[INFO|trainer.py:2135] 2024-09-21 15:46:09,741 >> Num examples = 22,500\n", "[INFO|trainer.py:2136] 2024-09-21 15:46:09,741 >> Num Epochs = 2\n", "[INFO|trainer.py:2137] 2024-09-21 15:46:09,741 >> Instantaneous batch size per device = 16\n", "[INFO|trainer.py:2140] 2024-09-21 15:46:09,741 >> Total train batch size (w. parallel, distributed & accumulation) = 128\n", "[INFO|trainer.py:2141] 2024-09-21 15:46:09,741 >> Gradient Accumulation steps = 8\n", "[INFO|trainer.py:2142] 2024-09-21 15:46:09,741 >> Total optimization steps = 350\n", "[INFO|trainer.py:2143] 2024-09-21 15:46:09,743 >> Number of trainable parameters = 14,966,784\n", " 0%| | 0/350 [00:00\n", " sys.exit(main())\n", " ^^^^^^\n", " File \"/home/inflaton/code/LLaMA-Factory/src/llamafactory/cli.py\", line 111, in main\n", " run_exp()\n", " File \"/home/inflaton/code/LLaMA-Factory/src/llamafactory/train/tuner.py\", line 50, in run_exp\n", " run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)\n", " File \"/home/inflaton/code/LLaMA-Factory/src/llamafactory/train/sft/workflow.py\", line 96, in run_sft\n", " train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/transformers/trainer.py\", line 1938, in train\n", " return inner_training_loop(\n", " ^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/transformers/trainer.py\", line 2279, in _inner_training_loop\n", " tr_loss_step = self.training_step(model, inputs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/transformers/trainer.py\", line 3318, in training_step\n", " loss = self.compute_loss(model, inputs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/transformers/trainer.py\", line 3363, in compute_loss\n", " outputs = model(**inputs)\n", " ^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1532, in _wrapped_call_impl\n", " return self._call_impl(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1541, in _call_impl\n", " return forward_call(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/accelerate/utils/operations.py\", line 819, in forward\n", " return model_forward(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/accelerate/utils/operations.py\", line 807, in __call__\n", " return convert_to_fp32(self.model_forward(*args, **kwargs))\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/torch/amp/autocast_mode.py\", line 16, in decorate_autocast\n", " return func(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/peft/peft_model.py\", line 1430, in forward\n", " return self.base_model(\n", " ^^^^^^^^^^^^^^^^\n", " File 
\"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1532, in _wrapped_call_impl\n", " return self._call_impl(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1541, in _call_impl\n", " return forward_call(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/peft/tuners/tuners_utils.py\", line 179, in forward\n", " return self.model.forward(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/transformers/models/qwen2/modeling_qwen2.py\", line 1082, in forward\n", " loss = loss_fct(shift_logits, shift_labels)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1532, in _wrapped_call_impl\n", " return self._call_impl(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1541, in _call_impl\n", " return forward_call(*args, **kwargs)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/torch/nn/modules/loss.py\", line 1185, in forward\n", " return F.cross_entropy(input, target, weight=self.weight,\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", " File \"/home/inflaton/miniconda3/envs/llm-finetuning/lib/python3.11/site-packages/torch/nn/functional.py\", line 3086, in cross_entropy\n", " return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)\n", " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", "RuntimeError: CUDA driver error: out of memory\n", " 0%| | 0/350 [00:05> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 15:46:21,868 >> Model config Qwen2Config {\n", " \"_name_or_path\": \"Qwen/Qwen2.5-1.5B-Instruct\",\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:22,126 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/vocab.json\n", 
"[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:22,127 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/merges.txt\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:22,127 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/tokenizer.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:22,127 >> loading file added_tokens.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:22,127 >> loading file special_tokens_map.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:22,127 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2533] 2024-09-21 15:46:22,222 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "[INFO|configuration_utils.py:733] 2024-09-21 15:46:23,467 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 15:46:23,468 >> Model config Qwen2Config {\n", " \"_name_or_path\": \"Qwen/Qwen2.5-1.5B-Instruct\",\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:23,726 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/vocab.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:23,726 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/merges.txt\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:23,726 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/tokenizer.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:23,726 >> loading file added_tokens.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:23,726 >> loading file special_tokens_map.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 15:46:23,726 >> loading file tokenizer_config.json from cache at 
/home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2533] 2024-09-21 15:46:23,811 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "09/21/2024 15:46:23 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n", "09/21/2024 15:46:23 - INFO - llamafactory.data.loader - Loading dataset alpaca_mgtv_p2.json...\n", "Converting format of dataset (num_proc=16): 100%|█| 25000/25000 [00:00<00:00, 11\n", "Running tokenizer on dataset (num_proc=16): 100%|█| 25000/25000 [00:01<00:00, 16\n", "training example:\n", "input_ids:\n", "[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 56568, 101909, 108024, 101497, 107969, 99329, 9370, 106040, 1773, 99329, 104190, 104506, 48443, 16, 13, 26853, 224, 57218, 28946, 36993, 101051, 46944, 107969, 27091, 3837, 107969, 27091, 36993, 53481, 46944, 100405, 99518, 104151, 101128, 9370, 57621, 8997, 17, 13, 89982, 68878, 17340, 99392, 107969, 99413, 3837, 107969, 99413, 20412, 107969, 27091, 111230, 8997, 18, 13, 26853, 224, 57218, 28946, 73670, 105396, 99885, 106386, 28330, 86119, 112469, 100246, 57621, 9370, 106538, 8997, 19, 13, 69162, 34204, 103991, 86119, 3837, 106040, 44063, 100345, 107591, 102104, 87752, 105220, 109487, 100653, 5122, 20412, 5373, 99520, 5373, 16530, 99335, 5373, 102104, 88991, 5373, 56007, 24339, 32100, 1773, 99200, 102104, 9370, 104317, 100142, 104506, 28311, 256, 481, 92498, 107969, 27091, 33108, 107969, 99413, 114562, 86119, 111230, 3837, 102104, 5122, 20412, 100631, 99520, 198, 256, 481, 92498, 107969, 27091, 33108, 107969, 99413, 53153, 101041, 100631, 108349, 83751, 63789, 20221, 86119, 111230, 3837, 102104, 5122, 16530, 99335, 198, 256, 481, 92498, 111842, 107666, 113479, 106386, 28330, 86119, 100631, 86119, 104151, 101128, 3837, 102104, 5122, 56007, 24339, 32100, 198, 256, 481, 92498, 111842, 107666, 99797, 108670, 34187, 107969, 99413, 106538, 3837, 102104, 5122, 102104, 88991, 198, 20, 13, 49602, 252, 99590, 15946, 53153, 42855, 99885, 102158, 27369, 3837, 105827, 65770, 99475, 109487, 101047, 110281, 18600, 1773, 77557, 3837, 108620, 99360, 2073, 99520, 854, 65770, 99475, 12857, 2073, 16530, 96332, 14880, 110439, 100001, 104190, 102104, 111842, 101080, 103936, 3407, 334, 107969, 27091, 66963, 73562, 109628, 45629, 105489, 3837, 104133, 111718, 106023, 5122, 101988, 115865, 110731, 9370, 105419, 3837, 115865, 99810, 69249, 59743, 104133, 104003, 115865, 36993, 16530, 101401, 68536, 99723, 3837, 115967, 104270, 102060, 110666, 112031, 1773, 14880, 109363, 115865, 110786, 101423, 104249, 3407, 334, 107969, 99413, 66963, 10236, 250, 253, 48921, 101221, 57218, 101961, 7948, 100894, 9370, 99288, 99818, 101063, 1773, 104269, 99288, 99818, 100774, 13343, 3837, 99798, 57218, 101961, 105664, 102373, 48921, 100271, 1773, 99650, 105616, 18493, 115865, 110731, 9370, 105419, 104388, 1773, 103968, 3837, 102606, 102115, 17340, 3837, 102373, 18493, 106340, 24562, 99774, 82224, 104424, 15946, 99372, 99244, 1773, 110597, 9370, 99288, 99818, 100012, 101416, 63109, 99242, 9370, 102373, 3837, 101988, 101938, 44063, 104003, 115865, 101329, 99314, 3837, 107974, 102373, 9370, 104575, 24562, 3837, 105699, 116418, 100005, 103000, 90663, 1773, 100147, 101070, 105443, 34187, 100097, 3837, 104989, 100833, 69249, 46944, 105190, 9370, 106023, 3407, 334, 111842, 101080, 103936, 66963, 4891, 223, 115, 100623, 21317, 
99315, 101037, 198, 151645, 198, 151644, 77091, 198, 99520, 151645]\n", "inputs:\n", "<|im_start|>system\n", "You are a helpful assistant.<|im_end|>\n", "<|im_start|>user\n", "你是一个情景猜谜游戏的主持人。游戏规则如下:\n", "\n", "1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n", "2. 主持人知道谜底,谜底是谜面的答案。\n", "3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n", "4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n", " - 若谜面和谜底能找到问题的答案,回答:是或者不是\n", " - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n", " - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n", " - 若参与者提问基本还原了谜底真相,回答:回答正确\n", "5. 回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n", "\n", "请严格按照这些规则回答参与者提出的问题。\n", "\n", "**谜面:** 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n", "\n", "**谜底:** 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n", "\n", "**参与者提出的问题:** 偷的人信神吗\n", "<|im_end|>\n", "<|im_start|>assistant\n", "不是<|im_end|>\n", "label_ids:\n", "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 99520, 151645]\n", "labels:\n", "不是<|im_end|>\n", "[INFO|configuration_utils.py:733] 2024-09-21 15:46:27,182 >> loading configuration 
file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 15:46:27,182 >> Model config Qwen2Config {\n", " \"_name_or_path\": \"Qwen/Qwen2.5-1.5B-Instruct\",\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|modeling_utils.py:3634] 2024-09-21 15:46:27,194 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/model.safetensors\n", "[INFO|modeling_utils.py:1572] 2024-09-21 15:46:27,198 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n", "[INFO|configuration_utils.py:1038] 2024-09-21 15:46:27,199 >> Generate config GenerationConfig {\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645\n", "}\n", "\n", "[INFO|modeling_utils.py:4463] 2024-09-21 15:46:57,790 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n", "\n", "[INFO|modeling_utils.py:4471] 2024-09-21 15:46:57,790 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at Qwen/Qwen2.5-1.5B-Instruct.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n", "[INFO|configuration_utils.py:993] 2024-09-21 15:46:58,065 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/generation_config.json\n", "[INFO|configuration_utils.py:1038] 2024-09-21 15:46:58,066 >> Generate config GenerationConfig {\n", " \"bos_token_id\": 151643,\n", " \"do_sample\": true,\n", " \"eos_token_id\": [\n", " 151645,\n", " 151643\n", " ],\n", " \"pad_token_id\": 151643,\n", " \"repetition_penalty\": 1.1,\n", " \"temperature\": 0.7,\n", " \"top_k\": 20,\n", " \"top_p\": 0.8\n", "}\n", "\n", "09/21/2024 15:46:58 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n", "09/21/2024 15:46:58 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.\n", "09/21/2024 15:46:58 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n", "09/21/2024 15:46:58 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n", "09/21/2024 15:46:58 - INFO - llamafactory.model.model_utils.misc - Found linear modules: up_proj,k_proj,o_proj,down_proj,gate_proj,v_proj,q_proj\n", "09/21/2024 15:46:58 - INFO - llamafactory.model.loader - trainable params: 9,232,384 || all params: 1,552,946,688 || 
trainable%: 0.5945\n", "[INFO|trainer.py:648] 2024-09-21 15:46:58,533 >> Using auto half precision backend\n", "[INFO|trainer.py:2134] 2024-09-21 15:46:58,837 >> ***** Running training *****\n", "[INFO|trainer.py:2135] 2024-09-21 15:46:58,837 >> Num examples = 22,500\n", "[INFO|trainer.py:2136] 2024-09-21 15:46:58,837 >> Num Epochs = 2\n", "[INFO|trainer.py:2137] 2024-09-21 15:46:58,837 >> Instantaneous batch size per device = 16\n", "[INFO|trainer.py:2140] 2024-09-21 15:46:58,837 >> Total train batch size (w. parallel, distributed & accumulation) = 128\n", "[INFO|trainer.py:2141] 2024-09-21 15:46:58,837 >> Gradient Accumulation steps = 8\n", "[INFO|trainer.py:2142] 2024-09-21 15:46:58,837 >> Total optimization steps = 350\n", "[INFO|trainer.py:2143] 2024-09-21 15:46:58,839 >> Number of trainable parameters = 9,232,384\n", "{'loss': 0.8917, 'grad_norm': 6.9668869972229, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.03}\n", "{'loss': 0.7893, 'grad_norm': 4.941070079803467, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.06}\n", "{'loss': 0.5842, 'grad_norm': 2.900670051574707, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.09}\n", "{'loss': 0.5452, 'grad_norm': 1.3792134523391724, 'learning_rate': 5.714285714285714e-05, 'epoch': 0.11}\n", "{'loss': 0.5008, 'grad_norm': 2.0141210556030273, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.14}\n", "{'loss': 0.4601, 'grad_norm': 1.0095895528793335, 'learning_rate': 8.571428571428571e-05, 'epoch': 0.17}\n", "{'loss': 0.4191, 'grad_norm': 1.0098716020584106, 'learning_rate': 0.0001, 'epoch': 0.2}\n", " 10%|████▏ | 35/350 [05:32<49:55, 9.51s/it][INFO|trainer.py:3819] 2024-09-21 15:52:31,446 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 15:52:31,446 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 15:52:31,446 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-35\n", "[INFO|configuration_utils.py:733] 2024-09-21 15:54:02,730 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 15:54:02,731 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 15:54:02,777 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-35/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 15:54:02,777 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-35/special_tokens_map.json\n", "{'loss': 0.3736, 'grad_norm': 0.5351007580757141, 'learning_rate': 
9.993784606094612e-05, 'epoch': 0.23}\n", "{'loss': 0.3715, 'grad_norm': 0.634586751461029, 'learning_rate': 9.975153876827008e-05, 'epoch': 0.26}\n", "{'loss': 0.372, 'grad_norm': 0.5977622270584106, 'learning_rate': 9.944154131125642e-05, 'epoch': 0.28}\n", "{'loss': 0.3397, 'grad_norm': 0.709690272808075, 'learning_rate': 9.900862439242719e-05, 'epoch': 0.31}\n", "{'loss': 0.3459, 'grad_norm': 0.4649967849254608, 'learning_rate': 9.84538643114539e-05, 'epoch': 0.34}\n", "{'loss': 0.3089, 'grad_norm': 0.6929703950881958, 'learning_rate': 9.777864028930705e-05, 'epoch': 0.37}\n", "{'loss': 0.3251, 'grad_norm': 0.762086033821106, 'learning_rate': 9.698463103929542e-05, 'epoch': 0.4}\n", " 20%|████████▍ | 70/350 [12:40<44:20, 9.50s/it][INFO|trainer.py:3819] 2024-09-21 15:59:39,607 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 15:59:39,608 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 15:59:39,608 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-70\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:01:11,066 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:01:11,067 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:01:11,111 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-70/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:01:11,111 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-70/special_tokens_map.json\n", "{'loss': 0.3082, 'grad_norm': 0.8383176922798157, 'learning_rate': 9.607381059352038e-05, 'epoch': 0.43}\n", "{'loss': 0.2929, 'grad_norm': 0.8879653215408325, 'learning_rate': 9.504844339512095e-05, 'epoch': 0.45}\n", "{'loss': 0.3087, 'grad_norm': 1.3542834520339966, 'learning_rate': 9.391107866851143e-05, 'epoch': 0.48}\n", "{'loss': 0.2889, 'grad_norm': 0.4832295775413513, 'learning_rate': 9.266454408160779e-05, 'epoch': 0.51}\n", "{'loss': 0.2977, 'grad_norm': 0.7334930896759033, 'learning_rate': 9.131193871579975e-05, 'epoch': 0.54}\n", "{'loss': 0.284, 'grad_norm': 0.9593209624290466, 'learning_rate': 8.985662536114613e-05, 'epoch': 0.57}\n", "{'loss': 0.3081, 'grad_norm': 0.6446382403373718, 'learning_rate': 8.83022221559489e-05, 'epoch': 0.6}\n", " 30%|████████████▎ | 105/350 [19:45<38:35, 9.45s/it][INFO|trainer.py:3819] 2024-09-21 16:06:43,938 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 16:06:43,938 >> Num 
examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 16:06:43,938 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-105\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:08:46,048 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:08:46,049 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:08:46,094 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-105/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:08:46,094 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-105/special_tokens_map.json\n", "{'loss': 0.2875, 'grad_norm': 0.5638000965118408, 'learning_rate': 8.665259359149132e-05, 'epoch': 0.63}\n", "{'loss': 0.2773, 'grad_norm': 0.8856341242790222, 'learning_rate': 8.491184090430364e-05, 'epoch': 0.65}\n", "{'loss': 0.2728, 'grad_norm': 0.933649480342865, 'learning_rate': 8.308429187984297e-05, 'epoch': 0.68}\n", "{'loss': 0.2676, 'grad_norm': 0.6383955478668213, 'learning_rate': 8.117449009293668e-05, 'epoch': 0.71}\n", "{'loss': 0.2793, 'grad_norm': 0.6104869246482849, 'learning_rate': 7.91871836117395e-05, 'epoch': 0.74}\n", "{'loss': 0.3009, 'grad_norm': 1.186869502067566, 'learning_rate': 7.712731319328798e-05, 'epoch': 0.77}\n", "{'loss': 0.2825, 'grad_norm': 0.4962313175201416, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.8}\n", " 40%|████████████████▍ | 140/350 [27:21<33:34, 9.59s/it][INFO|trainer.py:3819] 2024-09-21 16:14:20,188 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 16:14:20,188 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 16:14:20,188 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-140\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:15:54,027 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:15:54,027 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " 
\"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:15:54,069 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-140/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:15:54,069 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-140/special_tokens_map.json\n", "{'loss': 0.2769, 'grad_norm': 0.8555291295051575, 'learning_rate': 7.281053286765815e-05, 'epoch': 0.82}\n", "{'loss': 0.2896, 'grad_norm': 0.9149414300918579, 'learning_rate': 7.056435515653059e-05, 'epoch': 0.85}\n", "{'loss': 0.2784, 'grad_norm': 0.47306105494499207, 'learning_rate': 6.826705121831976e-05, 'epoch': 0.88}\n", "{'loss': 0.2722, 'grad_norm': 0.5558005571365356, 'learning_rate': 6.592433251258423e-05, 'epoch': 0.91}\n", "{'loss': 0.2573, 'grad_norm': 1.440822958946228, 'learning_rate': 6.354202340715026e-05, 'epoch': 0.94}\n", "{'loss': 0.2766, 'grad_norm': 0.8847922086715698, 'learning_rate': 6.112604669781572e-05, 'epoch': 0.97}\n", "{'loss': 0.2693, 'grad_norm': 0.6979252099990845, 'learning_rate': 5.868240888334653e-05, 'epoch': 1.0}\n", " 50%|████████████████████▌ | 175/350 [34:28<27:47, 9.53s/it][INFO|trainer.py:3819] 2024-09-21 16:21:27,237 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 16:21:27,237 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 16:21:27,237 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-175\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:23:01,512 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:23:01,513 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:23:01,550 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-175/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:23:01,550 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-175/special_tokens_map.json\n", "{'loss': 0.2619, 'grad_norm': 0.7065874934196472, 
'learning_rate': 5.621718523237427e-05, 'epoch': 1.02}\n", "{'loss': 0.2496, 'grad_norm': 0.9854199886322021, 'learning_rate': 5.373650467932122e-05, 'epoch': 1.05}\n", "{'loss': 0.268, 'grad_norm': 1.2961649894714355, 'learning_rate': 5.124653458690365e-05, 'epoch': 1.08}\n", "{'loss': 0.2473, 'grad_norm': 0.6371685862541199, 'learning_rate': 4.875346541309637e-05, 'epoch': 1.11}\n", "{'loss': 0.2649, 'grad_norm': 0.5193257331848145, 'learning_rate': 4.626349532067879e-05, 'epoch': 1.14}\n", "{'loss': 0.2671, 'grad_norm': 0.5210095643997192, 'learning_rate': 4.378281476762576e-05, 'epoch': 1.17}\n", "{'loss': 0.2466, 'grad_norm': 0.6401721239089966, 'learning_rate': 4.131759111665349e-05, 'epoch': 1.19}\n", " 60%|████████████████████████▌ | 210/350 [41:35<22:11, 9.51s/it][INFO|trainer.py:3819] 2024-09-21 16:28:33,947 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 16:28:33,947 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 16:28:33,947 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-210\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:30:09,340 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:30:09,341 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:30:09,379 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-210/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:30:09,379 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-210/special_tokens_map.json\n", "{'loss': 0.2345, 'grad_norm': 0.5983948707580566, 'learning_rate': 3.887395330218429e-05, 'epoch': 1.22}\n", "{'loss': 0.2294, 'grad_norm': 0.8043653964996338, 'learning_rate': 3.6457976592849754e-05, 'epoch': 1.25}\n", "{'loss': 0.2518, 'grad_norm': 0.9972067475318909, 'learning_rate': 3.4075667487415785e-05, 'epoch': 1.28}\n", "{'loss': 0.2492, 'grad_norm': 0.8310278654098511, 'learning_rate': 3.173294878168025e-05, 'epoch': 1.31}\n", "{'loss': 0.2547, 'grad_norm': 0.6404473781585693, 'learning_rate': 2.9435644843469436e-05, 'epoch': 1.34}\n", "{'loss': 0.2495, 'grad_norm': 0.7588335871696472, 'learning_rate': 2.718946713234185e-05, 'epoch': 1.36}\n", "{'loss': 0.2733, 'grad_norm': 0.6872820854187012, 'learning_rate': 2.500000000000001e-05, 'epoch': 1.39}\n", " 70%|████████████████████████████▋ | 245/350 [48:44<16:51, 9.64s/it][INFO|trainer.py:3819] 2024-09-21 16:35:43,271 >> \n", "***** Running Evaluation 
*****\n", "[INFO|trainer.py:3821] 2024-09-21 16:35:43,271 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 16:35:43,271 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-245\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:37:18,128 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:37:18,129 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:37:18,161 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-245/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:37:18,161 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-245/special_tokens_map.json\n", "{'loss': 0.2567, 'grad_norm': 0.8496139645576477, 'learning_rate': 2.2872686806712035e-05, 'epoch': 1.42}\n", "{'loss': 0.2311, 'grad_norm': 0.6145327091217041, 'learning_rate': 2.0812816388260518e-05, 'epoch': 1.45}\n", "{'loss': 0.2171, 'grad_norm': 0.6917315721511841, 'learning_rate': 1.8825509907063327e-05, 'epoch': 1.48}\n", "{'loss': 0.2285, 'grad_norm': 0.8362339735031128, 'learning_rate': 1.691570812015704e-05, 'epoch': 1.51}\n", "{'loss': 0.2643, 'grad_norm': 0.8186646699905396, 'learning_rate': 1.5088159095696363e-05, 'epoch': 1.54}\n", "{'loss': 0.2331, 'grad_norm': 0.9536941051483154, 'learning_rate': 1.3347406408508695e-05, 'epoch': 1.56}\n", "{'loss': 0.2396, 'grad_norm': 0.5406892895698547, 'learning_rate': 1.1697777844051105e-05, 'epoch': 1.59}\n", " 80%|████████████████████████████████▊ | 280/350 [55:51<11:04, 9.49s/it][INFO|trainer.py:3819] 2024-09-21 16:42:50,749 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 16:42:50,749 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 16:42:50,749 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-280\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:44:26,381 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:44:26,381 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 
0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:44:26,417 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-280/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:44:26,417 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-280/special_tokens_map.json\n", "{'loss': 0.2464, 'grad_norm': 0.9152795076370239, 'learning_rate': 1.0143374638853891e-05, 'epoch': 1.62}\n", "{'loss': 0.2451, 'grad_norm': 0.6611983776092529, 'learning_rate': 8.688061284200266e-06, 'epoch': 1.65}\n", "{'loss': 0.2268, 'grad_norm': 0.6333246827125549, 'learning_rate': 7.33545591839222e-06, 'epoch': 1.68}\n", "{'loss': 0.2407, 'grad_norm': 0.6191487312316895, 'learning_rate': 6.088921331488568e-06, 'epoch': 1.71}\n", "{'loss': 0.2383, 'grad_norm': 0.8804998397827148, 'learning_rate': 4.951556604879048e-06, 'epoch': 1.73}\n", "{'loss': 0.2203, 'grad_norm': 0.7381444573402405, 'learning_rate': 3.9261894064796135e-06, 'epoch': 1.76}\n", "{'loss': 0.2373, 'grad_norm': 0.8267008066177368, 'learning_rate': 3.0153689607045845e-06, 'epoch': 1.79}\n", " 90%|███████████████████████████████████ | 315/350 [1:03:00<05:31, 9.47s/it][INFO|trainer.py:3819] 2024-09-21 16:49:59,378 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 16:49:59,378 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 16:49:59,378 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-315\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:51:33,921 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:51:33,921 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:51:33,959 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-315/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:51:33,959 >> Special tokens file saved in 
saves/Qwen2.5-1.5B-Instruct/checkpoint-315/special_tokens_map.json\n", "{'loss': 0.2281, 'grad_norm': 0.8259851932525635, 'learning_rate': 2.221359710692961e-06, 'epoch': 1.82}\n", "{'loss': 0.2418, 'grad_norm': 0.7553776502609253, 'learning_rate': 1.5461356885461075e-06, 'epoch': 1.85}\n", "{'loss': 0.2474, 'grad_norm': 1.2472189664840698, 'learning_rate': 9.913756075728087e-07, 'epoch': 1.88}\n", "{'loss': 0.2323, 'grad_norm': 0.5642313957214355, 'learning_rate': 5.584586887435739e-07, 'epoch': 1.9}\n", "{'loss': 0.239, 'grad_norm': 0.6548100709915161, 'learning_rate': 2.4846123172992954e-07, 'epoch': 1.93}\n", "{'loss': 0.2458, 'grad_norm': 0.706020176410675, 'learning_rate': 6.215393905388278e-08, 'epoch': 1.96}\n", "{'loss': 0.2413, 'grad_norm': 0.7654218077659607, 'learning_rate': 0.0, 'epoch': 1.99}\n", "100%|███████████████████████████████████████| 350/350 [1:10:07<00:00, 9.51s/it][INFO|trainer.py:3819] 2024-09-21 16:57:06,607 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 16:57:06,607 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 16:57:06,607 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct/checkpoint-350\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:58:42,391 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:58:42,391 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:58:42,429 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-350/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:58:42,429 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/checkpoint-350/special_tokens_map.json\n", "[INFO|trainer.py:2394] 2024-09-21 16:58:42,586 >> \n", "\n", "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n", "{'train_runtime': 4303.7472, 'train_samples_per_second': 10.456, 'train_steps_per_second': 0.081, 'train_loss': 0.30218201875686646, 'epoch': 1.99}\n", "100%|███████████████████████████████████████| 350/350 [1:11:43<00:00, 12.30s/it]\n", "[INFO|trainer.py:3503] 2024-09-21 16:58:42,587 >> Saving model checkpoint to saves/Qwen2.5-1.5B-Instruct\n", "[INFO|configuration_utils.py:733] 2024-09-21 16:58:43,138 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B-Instruct/snapshots/5fee7c4ed634dc66c6e318c8ac2897b8b9154536/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 16:58:43,139 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 1536,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 8960,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 28,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 16:58:43,171 >> tokenizer config file saved in saves/Qwen2.5-1.5B-Instruct/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 16:58:43,171 >> Special tokens file saved in saves/Qwen2.5-1.5B-Instruct/special_tokens_map.json\n", "***** train metrics *****\n", " epoch = 1.99\n", " total_flos = 161035698GF\n", " train_loss = 0.3022\n", " train_runtime = 1:11:43.74\n", " train_samples_per_second = 10.456\n", " train_steps_per_second = 0.081\n", "Figure saved at: saves/Qwen2.5-1.5B-Instruct/training_loss.png\n", "Figure saved at: saves/Qwen2.5-1.5B-Instruct/training_eval_loss.png\n", "09/21/2024 16:58:43 - WARNING - llamafactory.extras.ploting - No metric eval_accuracy to plot.\n", "[INFO|trainer.py:3819] 2024-09-21 16:58:43,425 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 16:58:43,425 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 16:58:43,425 >> Batch size = 1\n", "100%|███████████████████████████████████████| 2500/2500 [01:36<00:00, 26.04it/s]\n", "***** eval metrics *****\n", " epoch = 1.99\n", " eval_loss = 0.2388\n", " eval_runtime = 0:01:36.10\n", " eval_samples_per_second = 26.012\n", " eval_steps_per_second = 26.012\n", "[INFO|modelcard.py:449] 2024-09-21 17:00:19,534 >> Dropping the following result as it does not have all the necessary fields:\n", "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}\n", "Current Directory:\n", "/home/inflaton/code/logical-reasoning/llama-factory\n", "loading env vars from: /home/inflaton/code/logical-reasoning/.env\n", "Adding /home/inflaton/code/logical-reasoning to sys.path\n", "loading /home/inflaton/code/logical-reasoning/llm_toolkit/logical_reasoning_utils.py\n", "Qwen Qwen2.5-0.5B-Instruct qwen config/mgtv_template.yaml ../datasets/mgtv\n", "Writing to config/models/Qwen2.5-0.5B-Instruct.yaml\n", 
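The 1.5B run above ends with `train_loss` ≈ 0.3022 and `eval_loss` ≈ 0.2388, and because `finetuning_type` is `lora`, what LLaMA-Factory writes to the `output_dir` is the trained LoRA adapter rather than a full set of model weights. As a minimal sketch of how that adapter could be loaded back for inference — assuming `transformers`, `peft`, and `accelerate` are installed, with the base model ID and adapter path taken from the log above and everything else (dtype, device placement, the sample prompt, generation length) purely illustrative:

```python
# Hypothetical post-training snippet, not part of the training script above:
# attach the LoRA adapter saved by the run to its base model and generate once.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "Qwen/Qwen2.5-1.5B-Instruct"       # base model used for SFT (from the log)
adapter_dir = "saves/Qwen2.5-1.5B-Instruct"  # LLaMA-Factory output_dir (from the log)

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype="auto", device_map="auto")

# Load the adapter on top of the frozen base weights.
model = PeftModel.from_pretrained(base, adapter_dir)
model = model.merge_and_unload()  # optionally fold the LoRA deltas into the base weights

messages = [{"role": "user", "content": "你好"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
print(tokenizer.decode(model.generate(inputs, max_new_tokens=16)[0], skip_special_tokens=True))
```

The intermediate `checkpoint-*` directories saved under the same `output_dir` during the run can be passed as `adapter_dir` instead, which makes it easy to compare earlier checkpoints against the final adapter.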
"config/models/Qwen2.5-0.5B-Instruct.yaml:\n", " {\n", " \"model_name_or_path\": \"Qwen/Qwen2.5-0.5B-Instruct\",\n", " \"stage\": \"sft\",\n", " \"do_train\": true,\n", " \"finetuning_type\": \"lora\",\n", " \"lora_target\": \"all\",\n", " \"dataset\": \"alpaca_mgtv_p2\",\n", " \"template\": \"qwen\",\n", " \"cutoff_len\": 8192,\n", " \"max_samples\": 25000,\n", " \"overwrite_cache\": true,\n", " \"preprocessing_num_workers\": 16,\n", " \"output_dir\": \"saves/Qwen2.5-0.5B-Instruct\",\n", " \"logging_steps\": 5,\n", " \"save_steps\": 35,\n", " \"plot_loss\": true,\n", " \"per_device_train_batch_size\": 16,\n", " \"gradient_accumulation_steps\": 8,\n", " \"learning_rate\": 0.0001,\n", " \"num_train_epochs\": 2.0,\n", " \"lr_scheduler_type\": \"cosine\",\n", " \"warmup_ratio\": 0.1,\n", " \"bf16\": true,\n", " \"ddp_timeout\": 180000000,\n", " \"val_size\": 0.1,\n", " \"per_device_eval_batch_size\": 1,\n", " \"eval_strategy\": \"steps\",\n", " \"eval_steps\": 35,\n", " \"report_to\": \"none\",\n", " \"run_name\": \"Qwen2.5-0.5B-Instruct_lora_sft\"\n", "}\n", "09/21/2024 17:00:25 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:00:26,916 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:00:26,917 >> Model config Qwen2Config {\n", " \"_name_or_path\": \"Qwen/Qwen2.5-0.5B-Instruct\",\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:27,186 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/vocab.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:27,186 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/merges.txt\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:27,186 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/tokenizer.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:27,186 >> loading file added_tokens.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:27,186 >> loading file special_tokens_map.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:27,186 >> loading file 
tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2533] 2024-09-21 17:00:27,298 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:00:28,386 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:00:28,386 >> Model config Qwen2Config {\n", " \"_name_or_path\": \"Qwen/Qwen2.5-0.5B-Instruct\",\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:28,652 >> loading file vocab.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/vocab.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:28,652 >> loading file merges.txt from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/merges.txt\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:28,652 >> loading file tokenizer.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/tokenizer.json\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:28,652 >> loading file added_tokens.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:28,652 >> loading file special_tokens_map.json from cache at None\n", "[INFO|tokenization_utils_base.py:2289] 2024-09-21 17:00:28,652 >> loading file tokenizer_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2533] 2024-09-21 17:00:28,744 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "09/21/2024 17:00:28 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>\n", "09/21/2024 17:00:28 - INFO - llamafactory.data.loader - Loading dataset alpaca_mgtv_p2.json...\n", "Converting format of dataset (num_proc=16): 100%|█| 25000/25000 [00:00<00:00, 95\n", "Running tokenizer on dataset (num_proc=16): 100%|█| 25000/25000 [00:01<00:00, 15\n", "training example:\n", "input_ids:\n", "[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 56568, 101909, 
108024, 101497, 107969, 99329, 9370, 106040, 1773, 99329, 104190, 104506, 48443, 16, 13, 26853, 224, 57218, 28946, 36993, 101051, 46944, 107969, 27091, 3837, 107969, 27091, 36993, 53481, 46944, 100405, 99518, 104151, 101128, 9370, 57621, 8997, 17, 13, 89982, 68878, 17340, 99392, 107969, 99413, 3837, 107969, 99413, 20412, 107969, 27091, 111230, 8997, 18, 13, 26853, 224, 57218, 28946, 73670, 105396, 99885, 106386, 28330, 86119, 112469, 100246, 57621, 9370, 106538, 8997, 19, 13, 69162, 34204, 103991, 86119, 3837, 106040, 44063, 100345, 107591, 102104, 87752, 105220, 109487, 100653, 5122, 20412, 5373, 99520, 5373, 16530, 99335, 5373, 102104, 88991, 5373, 56007, 24339, 32100, 1773, 99200, 102104, 9370, 104317, 100142, 104506, 28311, 256, 481, 92498, 107969, 27091, 33108, 107969, 99413, 114562, 86119, 111230, 3837, 102104, 5122, 20412, 100631, 99520, 198, 256, 481, 92498, 107969, 27091, 33108, 107969, 99413, 53153, 101041, 100631, 108349, 83751, 63789, 20221, 86119, 111230, 3837, 102104, 5122, 16530, 99335, 198, 256, 481, 92498, 111842, 107666, 113479, 106386, 28330, 86119, 100631, 86119, 104151, 101128, 3837, 102104, 5122, 56007, 24339, 32100, 198, 256, 481, 92498, 111842, 107666, 99797, 108670, 34187, 107969, 99413, 106538, 3837, 102104, 5122, 102104, 88991, 198, 20, 13, 49602, 252, 99590, 15946, 53153, 42855, 99885, 102158, 27369, 3837, 105827, 65770, 99475, 109487, 101047, 110281, 18600, 1773, 77557, 3837, 108620, 99360, 2073, 99520, 854, 65770, 99475, 12857, 2073, 16530, 96332, 14880, 110439, 100001, 104190, 102104, 111842, 101080, 103936, 3407, 334, 107969, 27091, 66963, 73562, 109628, 45629, 105489, 3837, 104133, 111718, 106023, 5122, 101988, 115865, 110731, 9370, 105419, 3837, 115865, 99810, 69249, 59743, 104133, 104003, 115865, 36993, 16530, 101401, 68536, 99723, 3837, 115967, 104270, 102060, 110666, 112031, 1773, 14880, 109363, 115865, 110786, 101423, 104249, 3407, 334, 107969, 99413, 66963, 10236, 250, 253, 48921, 101221, 57218, 101961, 7948, 100894, 9370, 99288, 99818, 101063, 1773, 104269, 99288, 99818, 100774, 13343, 3837, 99798, 57218, 101961, 105664, 102373, 48921, 100271, 1773, 99650, 105616, 18493, 115865, 110731, 9370, 105419, 104388, 1773, 103968, 3837, 102606, 102115, 17340, 3837, 102373, 18493, 106340, 24562, 99774, 82224, 104424, 15946, 99372, 99244, 1773, 110597, 9370, 99288, 99818, 100012, 101416, 63109, 99242, 9370, 102373, 3837, 101988, 101938, 44063, 104003, 115865, 101329, 99314, 3837, 107974, 102373, 9370, 104575, 24562, 3837, 105699, 116418, 100005, 103000, 90663, 1773, 100147, 101070, 105443, 34187, 100097, 3837, 104989, 100833, 69249, 46944, 105190, 9370, 106023, 3407, 334, 111842, 101080, 103936, 66963, 4891, 223, 115, 100623, 21317, 99315, 101037, 198, 151645, 198, 151644, 77091, 198, 99520, 151645]\n", "inputs:\n", "<|im_start|>system\n", "You are a helpful assistant.<|im_end|>\n", "<|im_start|>user\n", "你是一个情景猜谜游戏的主持人。游戏规则如下:\n", "\n", "1. 参与者会得到一个谜面,谜面会描述一个简单又难以理解的事件。\n", "2. 主持人知道谜底,谜底是谜面的答案。\n", "3. 参与者可以询问任何封闭式问题来找寻事件的真相。\n", "4. 对于每个问题,主持人将根据实际情况回答以下五个选项之一:是、不是、不重要、回答正确、问法错误。各回答的判断标准如下:\n", " - 若谜面和谜底能找到问题的答案,回答:是或者不是\n", " - 若谜面和谜底不能直接或者间接推断出问题的答案,回答:不重要\n", " - 若参与者提问不是一个封闭式问题或者问题难以理解,回答:问法错误\n", " - 若参与者提问基本还原了谜底真相,回答:回答正确\n", "5. 
回答中不能添加任何其它信息,也不能省略选项中的任何一个字。例如,不可以把“不是”省略成“不”。\n", "\n", "请严格按照这些规则回答参与者提出的问题。\n", "\n", "**谜面:** 在甄家村里,有一个古老的传说:每年南瓜丰收的季节,南瓜田里总有一个最大的南瓜会不翼而飞,村民们对此现象困惑不解。请找出南瓜失踪背后的原因。\n", "\n", "**谜底:** 真相原来与一位年迈的农夫有关。这位农夫年轻时,曾与一位美丽的姑娘相恋。他们约定在南瓜丰收的季节结婚。然而,命运弄人,姑娘在婚礼前的一场意外中离世。悲伤的农夫为了纪念心爱的姑娘,每年都会将最大的南瓜偷走,放到姑娘的墓前,以此寄托自己的哀思。这一行为延续了多年,成为了乡村里一个神秘的传说。\n", "\n", "**参与者提出的问题:** 偷的人信神吗\n", "<|im_end|>\n", "<|im_start|>assistant\n", "不是<|im_end|>\n", "label_ids:\n", "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 99520, 151645]\n", "labels:\n", "不是<|im_end|>\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:00:32,420 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:00:32,420 >> Model config Qwen2Config {\n", " \"_name_or_path\": \"Qwen/Qwen2.5-0.5B-Instruct\",\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " 
\"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|modeling_utils.py:3634] 2024-09-21 17:00:32,432 >> loading weights file model.safetensors from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/model.safetensors\n", "[INFO|modeling_utils.py:1572] 2024-09-21 17:00:32,436 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.\n", "[INFO|configuration_utils.py:1038] 2024-09-21 17:00:32,437 >> Generate config GenerationConfig {\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645\n", "}\n", "\n", "[INFO|modeling_utils.py:4463] 2024-09-21 17:00:58,518 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.\n", "\n", "[INFO|modeling_utils.py:4471] 2024-09-21 17:00:58,518 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at Qwen/Qwen2.5-0.5B-Instruct.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2ForCausalLM for predictions without further training.\n", "[INFO|configuration_utils.py:993] 2024-09-21 17:00:58,782 >> loading configuration file generation_config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/generation_config.json\n", "[INFO|configuration_utils.py:1038] 2024-09-21 17:00:58,782 >> Generate config GenerationConfig {\n", " \"bos_token_id\": 151643,\n", " \"do_sample\": true,\n", " \"eos_token_id\": [\n", " 151645,\n", " 151643\n", " ],\n", " \"pad_token_id\": 151643,\n", " \"repetition_penalty\": 1.1,\n", " \"temperature\": 0.7,\n", " \"top_k\": 20,\n", " \"top_p\": 0.8\n", "}\n", "\n", "09/21/2024 17:00:58 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.\n", "09/21/2024 17:00:58 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.\n", "09/21/2024 17:00:58 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.\n", "09/21/2024 17:00:58 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA\n", "09/21/2024 17:00:58 - INFO - llamafactory.model.model_utils.misc - Found linear modules: gate_proj,up_proj,k_proj,o_proj,q_proj,v_proj,down_proj\n", "09/21/2024 17:00:59 - INFO - llamafactory.model.loader - trainable params: 4,399,104 || all params: 498,431,872 || trainable%: 0.8826\n", "[INFO|trainer.py:648] 2024-09-21 17:00:59,124 >> Using auto half precision backend\n", "[INFO|trainer.py:2134] 2024-09-21 17:00:59,440 >> ***** Running training *****\n", "[INFO|trainer.py:2135] 2024-09-21 17:00:59,440 >> Num examples = 22,500\n", "[INFO|trainer.py:2136] 2024-09-21 17:00:59,440 >> Num Epochs = 2\n", "[INFO|trainer.py:2137] 2024-09-21 17:00:59,440 >> Instantaneous batch size per device = 16\n", "[INFO|trainer.py:2140] 2024-09-21 17:00:59,440 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", "[INFO|trainer.py:2141] 2024-09-21 17:00:59,440 >> Gradient Accumulation steps = 8\n", "[INFO|trainer.py:2142] 2024-09-21 17:00:59,440 >> Total optimization steps = 350\n", "[INFO|trainer.py:2143] 2024-09-21 17:00:59,441 >> Number of trainable parameters = 4,399,104\n", "{'loss': 1.1184, 'grad_norm': 15.737517356872559, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.03}\n", "{'loss': 0.9305, 'grad_norm': 7.921570777893066, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.06}\n", "{'loss': 0.6076, 'grad_norm': 3.3468737602233887, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.09}\n", "{'loss': 0.5437, 'grad_norm': 2.368464708328247, 'learning_rate': 5.714285714285714e-05, 'epoch': 0.11}\n", "{'loss': 0.5106, 'grad_norm': 3.843911647796631, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.14}\n", "{'loss': 0.4692, 'grad_norm': 1.8563235998153687, 'learning_rate': 8.571428571428571e-05, 'epoch': 0.17}\n", "{'loss': 0.4381, 'grad_norm': 1.487918496131897, 'learning_rate': 0.0001, 'epoch': 0.2}\n", " 10%|████▏ | 35/350 [02:29<22:10, 4.22s/it][INFO|trainer.py:3819] 2024-09-21 17:03:28,974 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:03:28,974 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:03:28,974 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-35\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:04:52,623 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:04:52,623 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:04:52,647 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-35/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:04:52,647 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-35/special_tokens_map.json\n", "{'loss': 0.4104, 'grad_norm': 2.7992444038391113, 'learning_rate': 9.993784606094612e-05, 'epoch': 0.23}\n", "{'loss': 0.3919, 'grad_norm': 1.2948148250579834, 'learning_rate': 9.975153876827008e-05, 'epoch': 0.26}\n", "{'loss': 0.3958, 'grad_norm': 2.0029242038726807, 'learning_rate': 9.944154131125642e-05, 'epoch': 0.28}\n", "{'loss': 0.3703, 'grad_norm': 1.9004690647125244, 'learning_rate': 9.900862439242719e-05, 'epoch': 0.31}\n", "{'loss': 0.3837, 'grad_norm': 2.839643716812134, 'learning_rate': 9.84538643114539e-05, 'epoch': 0.34}\n", "{'loss': 0.3435, 'grad_norm': 1.7279853820800781, 
'learning_rate': 9.777864028930705e-05, 'epoch': 0.37}\n", "{'loss': 0.3556, 'grad_norm': 1.0835622549057007, 'learning_rate': 9.698463103929542e-05, 'epoch': 0.4}\n", " 20%|████████▍ | 70/350 [06:21<19:49, 4.25s/it][INFO|trainer.py:3819] 2024-09-21 17:07:21,360 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:07:21,360 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:07:21,360 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-70\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:08:43,882 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:08:43,882 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:08:43,903 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-70/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:08:43,903 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-70/special_tokens_map.json\n", "{'loss': 0.3216, 'grad_norm': 1.249293565750122, 'learning_rate': 9.607381059352038e-05, 'epoch': 0.43}\n", "{'loss': 0.3061, 'grad_norm': 1.9808311462402344, 'learning_rate': 9.504844339512095e-05, 'epoch': 0.45}\n", "{'loss': 0.3325, 'grad_norm': 2.327874183654785, 'learning_rate': 9.391107866851143e-05, 'epoch': 0.48}\n", "{'loss': 0.332, 'grad_norm': 2.0999391078948975, 'learning_rate': 9.266454408160779e-05, 'epoch': 0.51}\n", "{'loss': 0.3349, 'grad_norm': 2.11915922164917, 'learning_rate': 9.131193871579975e-05, 'epoch': 0.54}\n", "{'loss': 0.3162, 'grad_norm': 1.733162760734558, 'learning_rate': 8.985662536114613e-05, 'epoch': 0.57}\n", "{'loss': 0.3228, 'grad_norm': 1.1676844358444214, 'learning_rate': 8.83022221559489e-05, 'epoch': 0.6}\n", " 30%|████████████▎ | 105/350 [10:13<17:14, 4.22s/it][INFO|trainer.py:3819] 2024-09-21 17:11:12,460 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:11:12,460 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:11:12,460 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-105\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:12:34,739 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:12:34,739 >> Model config Qwen2Config 
{\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:12:34,760 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-105/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:12:34,760 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-105/special_tokens_map.json\n", "{'loss': 0.3165, 'grad_norm': 1.040158748626709, 'learning_rate': 8.665259359149132e-05, 'epoch': 0.63}\n", "{'loss': 0.3019, 'grad_norm': 1.2740882635116577, 'learning_rate': 8.491184090430364e-05, 'epoch': 0.65}\n", "{'loss': 0.3119, 'grad_norm': 1.2684509754180908, 'learning_rate': 8.308429187984297e-05, 'epoch': 0.68}\n", "{'loss': 0.2917, 'grad_norm': 1.1115374565124512, 'learning_rate': 8.117449009293668e-05, 'epoch': 0.71}\n", "{'loss': 0.2885, 'grad_norm': 1.1410064697265625, 'learning_rate': 7.91871836117395e-05, 'epoch': 0.74}\n", "{'loss': 0.3216, 'grad_norm': 2.9339027404785156, 'learning_rate': 7.712731319328798e-05, 'epoch': 0.77}\n", "{'loss': 0.3026, 'grad_norm': 1.4184118509292603, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.8}\n", " 40%|████████████████▍ | 140/350 [14:04<14:55, 4.26s/it][INFO|trainer.py:3819] 2024-09-21 17:15:04,041 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:15:04,041 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:15:04,041 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-140\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:16:27,259 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:16:27,260 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 
17:16:27,281 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-140/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:16:27,281 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-140/special_tokens_map.json\n", "{'loss': 0.2798, 'grad_norm': 1.3493735790252686, 'learning_rate': 7.281053286765815e-05, 'epoch': 0.82}\n", "{'loss': 0.3084, 'grad_norm': 1.1956149339675903, 'learning_rate': 7.056435515653059e-05, 'epoch': 0.85}\n", "{'loss': 0.2996, 'grad_norm': 1.1371078491210938, 'learning_rate': 6.826705121831976e-05, 'epoch': 0.88}\n", "{'loss': 0.2936, 'grad_norm': 1.1137551069259644, 'learning_rate': 6.592433251258423e-05, 'epoch': 0.91}\n", "{'loss': 0.2718, 'grad_norm': 2.9275758266448975, 'learning_rate': 6.354202340715026e-05, 'epoch': 0.94}\n", "{'loss': 0.3136, 'grad_norm': 2.6529040336608887, 'learning_rate': 6.112604669781572e-05, 'epoch': 0.97}\n", "{'loss': 0.2802, 'grad_norm': 1.0029186010360718, 'learning_rate': 5.868240888334653e-05, 'epoch': 1.0}\n", " 50%|████████████████████▌ | 175/350 [17:56<12:23, 4.25s/it][INFO|trainer.py:3819] 2024-09-21 17:18:55,953 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:18:55,953 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:18:55,953 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-175\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:20:17,688 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:20:17,689 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:20:17,710 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-175/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:20:17,710 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-175/special_tokens_map.json\n", "{'loss': 0.2674, 'grad_norm': 1.5481024980545044, 'learning_rate': 5.621718523237427e-05, 'epoch': 1.02}\n", "{'loss': 0.2629, 'grad_norm': 1.1264318227767944, 'learning_rate': 5.373650467932122e-05, 'epoch': 1.05}\n", "{'loss': 0.2732, 'grad_norm': 1.3646587133407593, 'learning_rate': 5.124653458690365e-05, 'epoch': 1.08}\n", "{'loss': 0.271, 'grad_norm': 0.825769305229187, 'learning_rate': 4.875346541309637e-05, 'epoch': 1.11}\n", "{'loss': 0.2714, 'grad_norm': 0.9982427954673767, 'learning_rate': 4.626349532067879e-05, 'epoch': 1.14}\n", "{'loss': 0.2962, 'grad_norm': 2.239053964614868, 
'learning_rate': 4.378281476762576e-05, 'epoch': 1.17}\n", "{'loss': 0.2645, 'grad_norm': 0.8168760538101196, 'learning_rate': 4.131759111665349e-05, 'epoch': 1.19}\n", " 60%|████████████████████████▌ | 210/350 [21:46<09:53, 4.24s/it][INFO|trainer.py:3819] 2024-09-21 17:22:46,303 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:22:46,303 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:22:46,303 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-210\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:24:07,801 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:24:07,801 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:24:07,827 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-210/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:24:07,827 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-210/special_tokens_map.json\n", "{'loss': 0.2621, 'grad_norm': 1.397253155708313, 'learning_rate': 3.887395330218429e-05, 'epoch': 1.22}\n", "{'loss': 0.2458, 'grad_norm': 1.1573820114135742, 'learning_rate': 3.6457976592849754e-05, 'epoch': 1.25}\n", "{'loss': 0.2651, 'grad_norm': 2.0793874263763428, 'learning_rate': 3.4075667487415785e-05, 'epoch': 1.28}\n", "{'loss': 0.2654, 'grad_norm': 1.2757207155227661, 'learning_rate': 3.173294878168025e-05, 'epoch': 1.31}\n", "{'loss': 0.263, 'grad_norm': 1.1064047813415527, 'learning_rate': 2.9435644843469436e-05, 'epoch': 1.34}\n", "{'loss': 0.2776, 'grad_norm': 1.7825045585632324, 'learning_rate': 2.718946713234185e-05, 'epoch': 1.36}\n", "{'loss': 0.3012, 'grad_norm': 1.2948428392410278, 'learning_rate': 2.500000000000001e-05, 'epoch': 1.39}\n", " 70%|████████████████████████████▋ | 245/350 [25:37<07:31, 4.30s/it][INFO|trainer.py:3819] 2024-09-21 17:26:36,677 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:26:36,678 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:26:36,678 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-245\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:27:57,841 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 
2024-09-21 17:27:57,841 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:27:57,863 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-245/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:27:57,863 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-245/special_tokens_map.json\n", "{'loss': 0.2677, 'grad_norm': 1.3902900218963623, 'learning_rate': 2.2872686806712035e-05, 'epoch': 1.42}\n", "{'loss': 0.2483, 'grad_norm': 1.1757906675338745, 'learning_rate': 2.0812816388260518e-05, 'epoch': 1.45}\n", "{'loss': 0.2406, 'grad_norm': 1.2844176292419434, 'learning_rate': 1.8825509907063327e-05, 'epoch': 1.48}\n", "{'loss': 0.2531, 'grad_norm': 1.3673418760299683, 'learning_rate': 1.691570812015704e-05, 'epoch': 1.51}\n", "{'loss': 0.2702, 'grad_norm': 1.176766276359558, 'learning_rate': 1.5088159095696363e-05, 'epoch': 1.54}\n", "{'loss': 0.255, 'grad_norm': 1.8224906921386719, 'learning_rate': 1.3347406408508695e-05, 'epoch': 1.56}\n", "{'loss': 0.2628, 'grad_norm': 1.1993753910064697, 'learning_rate': 1.1697777844051105e-05, 'epoch': 1.59}\n", " 80%|████████████████████████████████▊ | 280/350 [29:27<04:56, 4.24s/it][INFO|trainer.py:3819] 2024-09-21 17:30:26,762 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:30:26,762 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:30:26,762 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-280\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:31:47,861 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:31:47,861 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 
151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:31:47,885 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-280/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:31:47,885 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-280/special_tokens_map.json\n", "{'loss': 0.2624, 'grad_norm': 1.6778804063796997, 'learning_rate': 1.0143374638853891e-05, 'epoch': 1.62}\n", "{'loss': 0.2604, 'grad_norm': 1.305879831314087, 'learning_rate': 8.688061284200266e-06, 'epoch': 1.65}\n", "{'loss': 0.234, 'grad_norm': 1.0664318799972534, 'learning_rate': 7.33545591839222e-06, 'epoch': 1.68}\n", "{'loss': 0.2584, 'grad_norm': 1.422717571258545, 'learning_rate': 6.088921331488568e-06, 'epoch': 1.71}\n", "{'loss': 0.2521, 'grad_norm': 1.3670940399169922, 'learning_rate': 4.951556604879048e-06, 'epoch': 1.73}\n", "{'loss': 0.2412, 'grad_norm': 1.36295747756958, 'learning_rate': 3.9261894064796135e-06, 'epoch': 1.76}\n", "{'loss': 0.2493, 'grad_norm': 1.4943523406982422, 'learning_rate': 3.0153689607045845e-06, 'epoch': 1.79}\n", " 90%|████████████████████████████████████▉ | 315/350 [33:17<02:29, 4.26s/it][INFO|trainer.py:3819] 2024-09-21 17:34:16,745 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:34:16,745 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:34:16,745 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-315\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:35:37,535 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:35:37,536 >> Model config Qwen2Config {\n", " \"architectures\": [\n", " \"Qwen2ForCausalLM\"\n", " ],\n", " \"attention_dropout\": 0.0,\n", " \"bos_token_id\": 151643,\n", " \"eos_token_id\": 151645,\n", " \"hidden_act\": \"silu\",\n", " \"hidden_size\": 896,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 4864,\n", " \"max_position_embeddings\": 32768,\n", " \"max_window_layers\": 21,\n", " \"model_type\": \"qwen2\",\n", " \"num_attention_heads\": 14,\n", " \"num_hidden_layers\": 24,\n", " \"num_key_value_heads\": 2,\n", " \"rms_norm_eps\": 1e-06,\n", " \"rope_theta\": 1000000.0,\n", " \"sliding_window\": null,\n", " \"tie_word_embeddings\": true,\n", " \"torch_dtype\": \"bfloat16\",\n", " \"transformers_version\": \"4.43.3\",\n", " \"use_cache\": true,\n", " \"use_sliding_window\": false,\n", " \"vocab_size\": 151936\n", "}\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:35:37,562 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-315/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:35:37,562 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-315/special_tokens_map.json\n", "{'loss': 0.2384, 'grad_norm': 1.4463350772857666, 'learning_rate': 2.221359710692961e-06, 'epoch': 1.82}\n", "{'loss': 0.2504, 'grad_norm': 1.4378561973571777, 'learning_rate': 1.5461356885461075e-06, 'epoch': 1.85}\n", "{'loss': 0.2644, 'grad_norm': 2.302417278289795, 'learning_rate': 9.913756075728087e-07, 'epoch': 1.88}\n", "{'loss': 0.2634, 'grad_norm': 1.0203107595443726, 'learning_rate': 5.584586887435739e-07, 'epoch': 1.9}\n", "{'loss': 0.2678, 'grad_norm': 1.4266246557235718, 'learning_rate': 
"{'loss': 0.2724, 'grad_norm': 1.2000150680541992, 'learning_rate': 6.215393905388278e-08, 'epoch': 1.96}\n", "{'loss': 0.2642, 'grad_norm': 1.3346699476242065, 'learning_rate': 0.0, 'epoch': 1.99}\n", "100%|█████████████████████████████████████████| 350/350 [37:04<00:00, 4.17s/it][INFO|trainer.py:3819] 2024-09-21 17:38:04,120 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:38:04,120 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:38:04,120 >> Batch size = 1\n", "\n", " 0%| | 0/2500 [00:00<?, ?it/s]\n", "[INFO|trainer.py:3503] >> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct/checkpoint-350\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:39:24,176 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:39:24,176 >> Model config Qwen2Config { ... identical to the dump above; repeated dump trimmed ... }\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:39:24,198 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-350/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:39:24,198 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/checkpoint-350/special_tokens_map.json\n", "[INFO|trainer.py:2394] 2024-09-21 17:39:24,314 >> \n", "\n",
"Training completed. Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n", "{'train_runtime': 2304.8725, 'train_samples_per_second': 19.524, 'train_steps_per_second': 0.152, 'train_loss': 0.32685707432883127, 'epoch': 1.99}\n", "100%|█████████████████████████████████████████| 350/350 [38:24<00:00, 6.59s/it]\n", "[INFO|trainer.py:3503] 2024-09-21 17:39:24,314 >> Saving model checkpoint to saves/Qwen2.5-0.5B-Instruct\n", "[INFO|configuration_utils.py:733] 2024-09-21 17:39:24,851 >> loading configuration file config.json from cache at /home/inflaton/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/a8b602d9dafd3a75d382e62757d83d89fca3be54/config.json\n", "[INFO|configuration_utils.py:800] 2024-09-21 17:39:24,851 >> Model config Qwen2Config { ... identical to the dump above; repeated dump trimmed ... }\n", "\n", "[INFO|tokenization_utils_base.py:2702] 2024-09-21 17:39:24,873 >> tokenizer config file saved in saves/Qwen2.5-0.5B-Instruct/tokenizer_config.json\n", "[INFO|tokenization_utils_base.py:2711] 2024-09-21 17:39:24,873 >> Special tokens file saved in saves/Qwen2.5-0.5B-Instruct/special_tokens_map.json\n", "***** train metrics *****\n", " epoch = 1.99\n", " total_flos = 44213382GF\n", " train_loss = 0.3269\n", " train_runtime = 0:38:24.87\n", " train_samples_per_second = 19.524\n", " train_steps_per_second = 0.152\n", "Figure saved at: saves/Qwen2.5-0.5B-Instruct/training_loss.png\n", "Figure saved at: saves/Qwen2.5-0.5B-Instruct/training_eval_loss.png\n", "09/21/2024 17:39:25 - WARNING - llamafactory.extras.ploting - No metric eval_accuracy to plot.\n", "[INFO|trainer.py:3819] 2024-09-21 17:39:25,047 >> \n", "***** Running Evaluation *****\n", "[INFO|trainer.py:3821] 2024-09-21 17:39:25,048 >> Num examples = 2500\n", "[INFO|trainer.py:3824] 2024-09-21 17:39:25,048 >> Batch size = 1\n", "100%|███████████████████████████████████████| 2500/2500 [01:18<00:00, 31.98it/s]\n", "***** eval metrics *****\n", " epoch = 1.99\n", " eval_loss = 0.2634\n", " eval_runtime = 0:01:18.20\n", " eval_samples_per_second = 31.968\n", " eval_steps_per_second = 31.968\n", "[INFO|modelcard.py:449] 2024-09-21 17:40:43,252 >> Dropping the following result as it does not have all the necessary fields:\n", "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}\n", "CPU times: user 44.7 s, sys: 20.2 s, total: 1min 4s\n", "Wall time: 1h 55min 33s\n" ] } ], "source": [ "%%time\n", "\n", "!./scripts/tune-mgtv-bf16.sh" ] }
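, { "cell_type": "markdown", "metadata": {}, "source": [ "The run above wrote the final model and tokenizer to `saves/Qwen2.5-0.5B-Instruct` (final `eval_loss` = 0.2634). The next cell is a minimal sanity-check sketch, not part of the original pipeline: it reloads that checkpoint with `transformers` and generates a short completion. Assumptions not taken from the logs: the directory holds a full fine-tuned model rather than a LoRA adapter, a GPU and `accelerate` are available for `device_map=\"auto\"`, and the prompt is a placeholder." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sanity-check sketch for the saved checkpoint (not part of the training\n", "# script). Assumptions: saves/Qwen2.5-0.5B-Instruct holds a full fine-tuned\n", "# model (not a LoRA adapter); accelerate is installed for device_map=\"auto\".\n", "import torch\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model_path = \"saves/Qwen2.5-0.5B-Instruct\"  # final save dir from the run above\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_path)\n", "model = AutoModelForCausalLM.from_pretrained(\n", "    model_path,\n", "    torch_dtype=torch.bfloat16,  # matches the bf16 dtype used for training\n", "    device_map=\"auto\",\n", ")\n", "\n", "# Placeholder prompt; the real task prompts presumably come from the MGTV\n", "# data used by scripts/tune-mgtv-bf16.sh.\n", "messages = [{\"role\": \"user\", \"content\": \"Hello!\"}]\n", "input_ids = tokenizer.apply_chat_template(\n", "    messages, add_generation_prompt=True, return_tensors=\"pt\"\n", ").to(model.device)\n", "\n", "with torch.no_grad():\n", "    output = model.generate(input_ids, max_new_tokens=64)\n", "print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))" ] } ], "metadata": { "accelerator": "GPU", "application/vnd.databricks.v1+notebook": { "dashboards": [], "environmentMetadata": null, "language": "python", "notebookMetadata": { "pythonIndentUnit": 4 }, "notebookName": "07_MAC_+_Qwen2-7B-Instruct_Unsloth_train", "widgets": {} }, "colab": { "gpuType": "T4", "provenance": [] },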
"kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 0 }