[INFO|parser.py:325] 2024-09-19 15:01:40,415 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16
[INFO|tokenization_utils_base.py:2287] 2024-09-19 15:01:40,419 >> loading file tokenizer.json
[INFO|tokenization_utils_base.py:2287] 2024-09-19 15:01:40,420 >> loading file added_tokens.json
[INFO|tokenization_utils_base.py:2287] 2024-09-19 15:01:40,420 >> loading file special_tokens_map.json
[INFO|tokenization_utils_base.py:2287] 2024-09-19 15:01:40,420 >> loading file tokenizer_config.json
[INFO|tokenization_utils_base.py:2533] 2024-09-19 15:01:40,699 >> Special tokens have been added to the vocabulary; make sure the associated word embeddings are fine-tuned or trained.
[INFO|template.py:270] 2024-09-19 15:01:40,700 >> Replace eos token: <|eot_id|>
[INFO|template.py:372] 2024-09-19 15:01:40,700 >> Add pad token: <|eot_id|>
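
The tokenizer steps above (loading the four tokenizer files, then swapping in <|eot_id|> as both eos and pad) can be reproduced with plain transformers. A minimal sketch, assuming the Hub model id rather than the local cache path:

    # Sketch: load the Llama 3.1 tokenizer and apply the eos/pad choices the log reports.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
    tokenizer.eos_token = "<|eot_id|>"  # "Replace eos token"
    tokenizer.pad_token = "<|eot_id|>"  # "Add pad token"
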
[INFO|loader.py:50] 2024-09-19 15:01:40,700 >> Loading dataset SUSTech/mt_bench_judge...
[INFO|loader.py:50] 2024-09-19 15:01:44,251 >> Loading dataset Judge.json...
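
Both sources above (a Hub dataset and a local JSON file) go through LLaMA-Factory's dataset loader; the equivalent raw loads with the datasets library look roughly like the sketch below. The local path for Judge.json is an assumption:

    # Sketch: load the two dataset sources named in the log.
    from datasets import load_dataset

    mt_bench = load_dataset("SUSTech/mt_bench_judge", split="train")
    judge = load_dataset("json", data_files="Judge.json", split="train")  # local file, path assumed
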
[INFO|configuration_utils.py:731] 2024-09-19 15:01:44,965 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/config.json
[INFO|configuration_utils.py:800] 2024-09-19 15:01:44,966 >> Model config LlamaConfig {
  "_name_or_path": "/home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 128256
}

[WARNING|rope.py:57] 2024-09-19 15:01:44,967 >> Input length is smaller than max length. Consider increasing the input length.
[INFO|rope.py:63] 2024-09-19 15:01:44,967 >> Using linear scaling strategy and setting scaling factor to 1.0
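
The config dump above shows Llama 3.1's llama3-type rope_scaling block (factor 8.0 over an original 8192-token window), and the rope.py messages note that, since the requested input length does not exceed the model's maximum, the extra linear scaling factor stays at 1.0. A minimal sketch for inspecting these settings, assuming the Hub model id:

    # Sketch: inspect the RoPE scaling settings reported in the config dump.
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
    print(config.rope_scaling)
    # {'factor': 8.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0,
    #  'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
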
[INFO|configuration_utils.py:731] 2024-09-19 15:01:46,306 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/config.json
[INFO|configuration_utils.py:800] 2024-09-19 15:01:46,307 >> Model config LlamaConfig {
  "_name_or_path": "/home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 128256
}

[INFO|configuration_utils.py:733] 2024-09-19 15:01:46,943 >> loading configuration file config.json from cache at /home/marl/.cache/huggingface/hub/models--unslothai--other/snapshots/43d9e0f2f19a5d7836895f648dc0e762816acf77/config.json
[INFO|configuration_utils.py:800] 2024-09-19 15:01:46,943 >> Model config LlamaConfig {
  "_name_or_path": "unslothai/other",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 0,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 0
}

[INFO|modeling_utils.py:3634] 2024-09-19 15:01:47,465 >> loading weights file model.safetensors from cache at /home/marl/.cache/huggingface/hub/models--unslothai--other/snapshots/43d9e0f2f19a5d7836895f648dc0e762816acf77/model.safetensors
[INFO|configuration_utils.py:1038] 2024-09-19 15:01:47,466 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

[INFO|configuration_utils.py:733] 2024-09-19 15:01:47,618 >> loading configuration file config.json from cache at /home/marl/.cache/huggingface/hub/models--unslothai--repeat/snapshots/7c48478c02f84ed89f149b0815cc0216ee831fb0/config.json
[INFO|configuration_utils.py:800] 2024-09-19 15:01:47,619 >> Model config LlamaConfig {
  "_name_or_path": "unslothai/repeat",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 0,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 0
}

[INFO|modeling_utils.py:3634] 2024-09-19 15:01:47,620 >> loading weights file model.safetensors from cache at /home/marl/.cache/huggingface/hub/models--unslothai--repeat/snapshots/7c48478c02f84ed89f149b0815cc0216ee831fb0/model.safetensors
[INFO|configuration_utils.py:1038] 2024-09-19 15:01:47,621 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

[INFO|configuration_utils.py:733] 2024-09-19 15:01:48,251 >> loading configuration file config.json from cache at /home/marl/.cache/huggingface/hub/models--unslothai--vram-24/snapshots/61324ceeacd75b2b31f7a789a9c9d82058e6118c/config.json
[INFO|configuration_utils.py:800] 2024-09-19 15:01:48,252 >> Model config LlamaConfig {
  "_name_or_path": "unslothai/vram-24",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 0,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 0
}

[INFO|modeling_utils.py:3634] 2024-09-19 15:01:48,630 >> loading weights file model.safetensors from cache at /home/marl/.cache/huggingface/hub/models--unslothai--vram-24/snapshots/61324ceeacd75b2b31f7a789a9c9d82058e6118c/model.safetensors
[INFO|configuration_utils.py:1038] 2024-09-19 15:01:48,632 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

[INFO|configuration_utils.py:733] 2024-09-19 15:01:49,237 >> loading configuration file config.json from cache at /home/marl/.cache/huggingface/hub/models--unslothai--1/snapshots/7ec782b7604cd9ea0781c23a4270f031650f5617/config.json
[INFO|configuration_utils.py:800] 2024-09-19 15:01:49,237 >> Model config LlamaConfig {
  "_name_or_path": "unslothai/1",
  "architectures": [
    "LlamaModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 0,
  "initializer_range": 0.02,
  "intermediate_size": 0,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 0,
  "num_hidden_layers": 0,
  "num_key_value_heads": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 0
}

[INFO|modeling_utils.py:3634] 2024-09-19 15:01:49,565 >> loading weights file model.safetensors from cache at /home/marl/.cache/huggingface/hub/models--unslothai--1/snapshots/7ec782b7604cd9ea0781c23a4270f031650f5617/model.safetensors
[INFO|configuration_utils.py:1038] 2024-09-19 15:01:49,566 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

[INFO|configuration_utils.py:731] 2024-09-19 15:01:49,567 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/config.json
[INFO|configuration_utils.py:800] 2024-09-19 15:01:49,568 >> Model config LlamaConfig {
  "_name_or_path": "/home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 128256
}

[INFO|configuration_utils.py:731] 2024-09-19 15:01:49,582 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/config.json
[INFO|configuration_utils.py:800] 2024-09-19 15:01:49,583 >> Model config LlamaConfig {
  "_name_or_path": "/home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.2",
  "use_cache": true,
  "vocab_size": 128256
}

[INFO|modeling_utils.py:3631] 2024-09-19 15:01:49,584 >> loading weights file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/model.safetensors.index.json
[INFO|modeling_utils.py:1572] 2024-09-19 15:01:49,584 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:1038] 2024-09-19 15:01:49,585 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ]
}

[INFO|modeling_utils.py:4463] 2024-09-19 15:01:56,537 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
[INFO|modeling_utils.py:4471] 2024-09-19 15:01:56,537 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
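
The sequence above (sharded safetensors index, instantiation under torch.bfloat16, all checkpoint weights used) corresponds to a standard from_pretrained call. A minimal sketch, assuming the Hub model id rather than the local cache path:

    # Sketch: instantiate the checkpoint in bfloat16, as the log reports.
    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        torch_dtype=torch.bfloat16,  # matches "default dtype torch.bfloat16"
    )
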
[INFO|configuration_utils.py:991] 2024-09-19 15:01:56,540 >> loading configuration file /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693/generation_config.json
[INFO|configuration_utils.py:1038] 2024-09-19 15:01:56,541 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}
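
The sampling defaults above come from the checkpoint's generation_config.json. A minimal sketch of applying them explicitly, reusing the model and tokenizer from the earlier sketches:

    # Sketch: generate with the sampling defaults the log reports.
    inputs = tokenizer("You are a judge. Rate this answer:", return_tensors="pt")
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        max_new_tokens=64,  # assumed; the log does not state a generation length
    )
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
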
[WARNING|logging.py:328] 2024-09-19 15:01:56,848 >> Unsloth: We successfully patched the tokenizer to add a {% if add_generation_prompt %} to the chat_template.
This is not a bug, but please notify the Unsloth maintainers - thanks!
[WARNING|logging.py:328] 2024-09-19 15:01:56,849 >> /home/marl/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3.1-8B-Instruct/snapshots/07eb05b21d191a58c577b4a45982fe0c049d0693 does not have a padding token! Will use pad_token = <|finetune_right_pad_id|>.
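
Unsloth resolves the missing padding token by reusing Llama 3.1's reserved <|finetune_right_pad_id|> token; the same fix can be applied by hand. A minimal sketch:

    # Sketch: assign the reserved Llama 3.1 padding token, as the warning describes.
    tokenizer.pad_token = "<|finetune_right_pad_id|>"
    assert tokenizer.pad_token_id is not None  # resolves to a real id in Llama 3.1's vocab
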
[INFO|checkpointing.py:103] 2024-09-19 15:01:57,352 >> Gradient checkpointing enabled.
[INFO|adapter.py:302] 2024-09-19 15:01:57,352 >> Upcasting trainable params to float32.
[INFO|adapter.py:158] 2024-09-19 15:01:57,352 >> Fine-tuning method: DoRA
[INFO|misc.py:51] 2024-09-19 15:01:57,353 >> Found linear modules: k_proj,v_proj,gate_proj,down_proj,q_proj,o_proj,up_proj
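
The adapter setup above (DoRA applied to all seven linear projections) maps onto a PEFT LoraConfig with use_dora enabled. A minimal sketch; the rank and alpha values are assumptions, since the log does not state them:

    # Sketch: a DoRA adapter over the linear modules the log lists.
    from peft import LoraConfig, get_peft_model

    peft_config = LoraConfig(
        task_type="CAUSAL_LM",
        use_dora=True,        # DoRA: weight-decomposed low-rank adaptation
        r=16,                 # assumed rank
        lora_alpha=32,        # assumed scaling
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
    )
    model = get_peft_model(model, peft_config)
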
[WARNING|logging.py:328] 2024-09-19 15:01:58,892 >> Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters are not enabled or a bias term (like in Qwen) is used.
[WARNING|logging.py:328] 2024-09-19 15:01:58,892 >> Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters are not enabled or a bias term (like in Qwen) is used.
[WARNING|logging.py:328] 2024-09-19 15:01:58,892 >> Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters are not enabled or a bias term (like in Qwen) is used.
[WARNING|logging.py:328] 2024-09-19 15:01:58,893 >> Unsloth 2024.9 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.
[INFO|loader.py:196] 2024-09-19 15:01:59,917 >> trainable params: 43,319,296 || all params: 8,073,580,544 || trainable%: 0.5366
[INFO|trainer.py:648] 2024-09-19 15:01:59,932 >> Using auto half precision backend
[WARNING|<string>:213] 2024-09-19 15:02:00,225 >> ==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 830 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 510
 "-____-"     Number of trainable parameters = 43,319,296
[INFO|callbacks.py:137] 2024-09-19 15:02:00,753 >> Initial PiSSA adapter will be saved at: saves/LLaMA3-8B-Chat/lora/JudgePierce/pissa_init.
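
PiSSA initializes the adapter from the principal singular components of the base weights, and the initial adapter is saved up front so it can be converted back after training. A minimal sketch of the PEFT-side configuration, assuming PEFT's "pissa" initializer and the same assumed rank/targets as above; how LLaMA-Factory combines this with the DoRA setting is not visible in the log:

    # Sketch: PiSSA initialization for a LoRA-family adapter in PEFT.
    from peft import LoraConfig

    pissa_config = LoraConfig(
        task_type="CAUSAL_LM",
        init_lora_weights="pissa",  # SVD-based init from the base weights
        r=16, lora_alpha=32,        # assumed values, as before
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
    )
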
[INFO|callbacks.py:310] 2024-09-19 15:05:11,452 >> {'loss': 0.8996, 'learning_rate': 4.9988e-05, 'epoch': 0.10, 'throughput': 411.79}
[INFO|callbacks.py:310] 2024-09-19 15:08:29,947 >> {'loss': 0.6806, 'learning_rate': 4.9953e-05, 'epoch': 0.19, 'throughput': 408.84}
[INFO|callbacks.py:310] 2024-09-19 15:11:13,747 >> {'loss': 0.6180, 'learning_rate': 4.9893e-05, 'epoch': 0.29, 'throughput': 404.26}
[INFO|callbacks.py:310] 2024-09-19 15:14:02,211 >> {'loss': 0.5594, 'learning_rate': 4.9811e-05, 'epoch': 0.39, 'throughput': 403.27}
[INFO|callbacks.py:310] 2024-09-19 15:17:18,092 >> {'loss': 0.5253, 'learning_rate': 4.9704e-05, 'epoch': 0.48, 'throughput': 396.87}
[INFO|callbacks.py:310] 2024-09-19 15:20:42,613 >> {'loss': 0.4794, 'learning_rate': 4.9574e-05, 'epoch': 0.58, 'throughput': 392.36}
[INFO|callbacks.py:310] 2024-09-19 15:23:53,577 >> {'loss': 0.4546, 'learning_rate': 4.9421e-05, 'epoch': 0.67, 'throughput': 393.95}
[INFO|callbacks.py:310] 2024-09-19 15:26:56,955 >> {'loss': 0.4703, 'learning_rate': 4.9245e-05, 'epoch': 0.77, 'throughput': 395.28}
[INFO|callbacks.py:310] 2024-09-19 15:29:57,393 >> {'loss': 0.4407, 'learning_rate': 4.9046e-05, 'epoch': 0.87, 'throughput': 395.30}
[INFO|callbacks.py:310] 2024-09-19 15:32:59,265 >> {'loss': 0.4425, 'learning_rate': 4.8824e-05, 'epoch': 0.96, 'throughput': 395.36}
[INFO|callbacks.py:310] 2024-09-19 15:36:03,653 >> {'loss': 0.4081, 'learning_rate': 4.8579e-05, 'epoch': 1.06, 'throughput': 395.68}
[INFO|callbacks.py:310] 2024-09-19 15:39:11,834 >> {'loss': 0.4002, 'learning_rate': 4.8312e-05, 'epoch': 1.16, 'throughput': 395.63}
[INFO|callbacks.py:310] 2024-09-19 15:41:57,790 >> {'loss': 0.3587, 'learning_rate': 4.8023e-05, 'epoch': 1.25, 'throughput': 395.35}
[INFO|callbacks.py:310] 2024-09-19 15:45:01,971 >> {'loss': 0.3560, 'learning_rate': 4.7712e-05, 'epoch': 1.35, 'throughput': 395.98}
[INFO|callbacks.py:310] 2024-09-19 15:47:56,540 >> {'loss': 0.3639, 'learning_rate': 4.7379e-05, 'epoch': 1.45, 'throughput': 396.16}
[INFO|callbacks.py:310] 2024-09-19 15:51:00,164 >> {'loss': 0.3288, 'learning_rate': 4.7025e-05, 'epoch': 1.54, 'throughput': 395.91}
[INFO|callbacks.py:310] 2024-09-19 15:54:25,445 >> {'loss': 0.3563, 'learning_rate': 4.6651e-05, 'epoch': 1.64, 'throughput': 396.14}
[INFO|callbacks.py:310] 2024-09-19 15:57:21,312 >> {'loss': 0.3642, 'learning_rate': 4.6255e-05, 'epoch': 1.73, 'throughput': 396.03}
[INFO|callbacks.py:310] 2024-09-19 16:00:32,266 >> {'loss': 0.3407, 'learning_rate': 4.5840e-05, 'epoch': 1.83, 'throughput': 396.47}
[INFO|callbacks.py:310] 2024-09-19 16:03:50,197 >> {'loss': 0.3216, 'learning_rate': 4.5405e-05, 'epoch': 1.93, 'throughput': 396.56}
[INFO|trainer.py:3503] 2024-09-19 16:03:50,197 >> Saving model checkpoint to saves/LLaMA3-8B-Chat/lora/JudgePierce/checkpoint-100
[INFO|callbacks.py:310] 2024-09-19 16:07:13,059 >> {'loss': 0.3197, 'learning_rate': 4.4950e-05, 'epoch': 2.02, 'throughput': 395.40}
[INFO|callbacks.py:310] 2024-09-19 16:10:02,255 >> {'loss': 0.2904, 'learning_rate': 4.4477e-05, 'epoch': 2.12, 'throughput': 394.79}
[INFO|callbacks.py:310] 2024-09-19 16:13:06,914 >> {'loss': 0.2498, 'learning_rate': 4.3985e-05, 'epoch': 2.22, 'throughput': 394.39}
[INFO|callbacks.py:310] 2024-09-19 16:16:07,251 >> {'loss': 0.2690, 'learning_rate': 4.3475e-05, 'epoch': 2.31, 'throughput': 393.99}
[INFO|callbacks.py:310] 2024-09-19 16:19:28,568 >> {'loss': 0.2838, 'learning_rate': 4.2948e-05, 'epoch': 2.41, 'throughput': 393.80}
[INFO|callbacks.py:310] 2024-09-19 16:22:40,612 >> {'loss': 0.2466, 'learning_rate': 4.2403e-05, 'epoch': 2.51, 'throughput': 393.67}
[INFO|callbacks.py:310] 2024-09-19 16:25:42,935 >> {'loss': 0.2272, 'learning_rate': 4.1842e-05, 'epoch': 2.60, 'throughput': 393.15}
[INFO|callbacks.py:310] 2024-09-19 16:29:16,509 >> {'loss': 0.2484, 'learning_rate': 4.1265e-05, 'epoch': 2.70, 'throughput': 393.10}
[INFO|callbacks.py:310] 2024-09-19 16:32:12,701 >> {'loss': 0.2339, 'learning_rate': 4.0673e-05, 'epoch': 2.80, 'throughput': 392.63}
[INFO|callbacks.py:310] 2024-09-19 16:35:55,248 >> {'loss': 0.2597, 'learning_rate': 4.0066e-05, 'epoch': 2.89, 'throughput': 392.84}
[INFO|callbacks.py:310] 2024-09-19 16:38:50,490 >> {'loss': 0.2525, 'learning_rate': 3.9444e-05, 'epoch': 2.99, 'throughput': 392.73}
[INFO|callbacks.py:310] 2024-09-19 16:41:47,937 >> {'loss': 0.1832, 'learning_rate': 3.8809e-05, 'epoch': 3.08, 'throughput': 392.41}
[INFO|callbacks.py:310] 2024-09-19 16:44:58,684 >> {'loss': 0.1519, 'learning_rate': 3.8161e-05, 'epoch': 3.18, 'throughput': 392.60}
[INFO|callbacks.py:310] 2024-09-19 16:47:53,818 >> {'loss': 0.1707, 'learning_rate': 3.7500e-05, 'epoch': 3.28, 'throughput': 392.58}
[INFO|callbacks.py:310] 2024-09-19 16:51:13,288 >> {'loss': 0.1603, 'learning_rate': 3.6827e-05, 'epoch': 3.37, 'throughput': 392.77}
[INFO|callbacks.py:310] 2024-09-19 16:54:40,189 >> {'loss': 0.1683, 'learning_rate': 3.6143e-05, 'epoch': 3.47, 'throughput': 393.19}
[INFO|callbacks.py:310] 2024-09-19 16:57:28,484 >> {'loss': 0.1734, 'learning_rate': 3.5449e-05, 'epoch': 3.57, 'throughput': 392.80}
[INFO|callbacks.py:310] 2024-09-19 17:00:50,184 >> {'loss': 0.1543, 'learning_rate': 3.4745e-05, 'epoch': 3.66, 'throughput': 392.50}
[INFO|callbacks.py:310] 2024-09-19 17:04:08,221 >> {'loss': 0.1601, 'learning_rate': 3.4031e-05, 'epoch': 3.76, 'throughput': 392.93}
[INFO|callbacks.py:310] 2024-09-19 17:06:58,844 >> {'loss': 0.1911, 'learning_rate': 3.3309e-05, 'epoch': 3.86, 'throughput': 393.04}
[INFO|trainer.py:3503] 2024-09-19 17:06:58,845 >> Saving model checkpoint to saves/LLaMA3-8B-Chat/lora/JudgePierce/checkpoint-200
[INFO|callbacks.py:310] 2024-09-19 17:09:52,758 >> {'loss': 0.1741, 'learning_rate': 3.2579e-05, 'epoch': 3.95, 'throughput': 393.20}
[INFO|callbacks.py:310] 2024-09-19 17:12:42,126 >> {'loss': 0.1334, 'learning_rate': 3.1842e-05, 'epoch': 4.05, 'throughput': 393.35}
[INFO|callbacks.py:310] 2024-09-19 17:16:01,091 >> {'loss': 0.1027, 'learning_rate': 3.1098e-05, 'epoch': 4.14, 'throughput': 393.86}
[INFO|callbacks.py:310] 2024-09-19 17:19:01,299 >> {'loss': 0.0955, 'learning_rate': 3.0348e-05, 'epoch': 4.24, 'throughput': 394.08}
[INFO|callbacks.py:310] 2024-09-19 17:22:03,579 >> {'loss': 0.1006, 'learning_rate': 2.9594e-05, 'epoch': 4.34, 'throughput': 394.31}
[INFO|callbacks.py:310] 2024-09-19 17:24:44,568 >> {'loss': 0.1007, 'learning_rate': 2.8835e-05, 'epoch': 4.43, 'throughput': 394.43}
[INFO|callbacks.py:310] 2024-09-19 17:27:55,254 >> {'loss': 0.1033, 'learning_rate': 2.8072e-05, 'epoch': 4.53, 'throughput': 394.76}
[INFO|callbacks.py:310] 2024-09-19 17:30:55,686 >> {'loss': 0.1023, 'learning_rate': 2.7307e-05, 'epoch': 4.63, 'throughput': 395.01}
[INFO|callbacks.py:310] 2024-09-19 17:34:01,751 >> {'loss': 0.1121, 'learning_rate': 2.6539e-05, 'epoch': 4.72, 'throughput': 395.28}
[INFO|callbacks.py:310] 2024-09-19 17:37:01,776 >> {'loss': 0.1028, 'learning_rate': 2.5770e-05, 'epoch': 4.82, 'throughput': 395.48}
[INFO|callbacks.py:310] 2024-09-19 17:40:04,358 >> {'loss': 0.1055, 'learning_rate': 2.5000e-05, 'epoch': 4.92, 'throughput': 395.69}
[INFO|callbacks.py:310] 2024-09-19 17:42:51,693 >> {'loss': 0.1037, 'learning_rate': 2.4230e-05, 'epoch': 5.01, 'throughput': 395.79}
[INFO|callbacks.py:310] 2024-09-19 17:45:59,478 >> {'loss': 0.0633, 'learning_rate': 2.3461e-05, 'epoch': 5.11, 'throughput': 396.05}
[INFO|callbacks.py:310] 2024-09-19 17:49:02,492 >> {'loss': 0.0521, 'learning_rate': 2.2693e-05, 'epoch': 5.20, 'throughput': 396.24}
[INFO|callbacks.py:310] 2024-09-19 17:52:17,309 >> {'loss': 0.0625, 'learning_rate': 2.1928e-05, 'epoch': 5.30, 'throughput': 396.59}
[INFO|callbacks.py:310] 2024-09-19 17:55:15,619 >> {'loss': 0.0590, 'learning_rate': 2.1165e-05, 'epoch': 5.40, 'throughput': 396.74}
[INFO|callbacks.py:310] 2024-09-19 17:57:57,249 >> {'loss': 0.0648, 'learning_rate': 2.0406e-05, 'epoch': 5.49, 'throughput': 396.77}
[INFO|callbacks.py:310] 2024-09-19 18:00:56,018 >> {'loss': 0.0709, 'learning_rate': 1.9652e-05, 'epoch': 5.59, 'throughput': 396.92}
[INFO|callbacks.py:310] 2024-09-19 18:03:45,922 >> {'loss': 0.0641, 'learning_rate': 1.8902e-05, 'epoch': 5.69, 'throughput': 396.97}
[INFO|callbacks.py:310] 2024-09-19 18:06:55,209 >> {'loss': 0.0689, 'learning_rate': 1.8158e-05, 'epoch': 5.78, 'throughput': 397.19}
[INFO|trainer.py:3503] 2024-09-19 18:06:55,210 >> Saving model checkpoint to saves/LLaMA3-8B-Chat/lora/JudgePierce/checkpoint-300
[INFO|callbacks.py:310] 2024-09-19 18:09:56,064 >> {'loss': 0.0550, 'learning_rate': 1.7421e-05, 'epoch': 5.88, 'throughput': 397.36}
[INFO|callbacks.py:310] 2024-09-19 18:12:55,928 >> {'loss': 0.0626, 'learning_rate': 1.6691e-05, 'epoch': 5.98, 'throughput': 397.56}
[INFO|callbacks.py:310] 2024-09-19 18:15:57,052 >> {'loss': 0.0465, 'learning_rate': 1.5969e-05, 'epoch': 6.07, 'throughput': 397.77}
[INFO|callbacks.py:310] 2024-09-19 18:19:01,654 >> {'loss': 0.0421, 'learning_rate': 1.5255e-05, 'epoch': 6.17, 'throughput': 397.98}
[INFO|callbacks.py:310] 2024-09-19 18:21:57,727 >> {'loss': 0.0303, 'learning_rate': 1.4551e-05, 'epoch': 6.27, 'throughput': 398.18}
[INFO|callbacks.py:310] 2024-09-19 18:25:07,621 >> {'loss': 0.0337, 'learning_rate': 1.3857e-05, 'epoch': 6.36, 'throughput': 398.43}
[INFO|callbacks.py:310] 2024-09-19 18:28:04,937 >> {'loss': 0.0333, 'learning_rate': 1.3173e-05, 'epoch': 6.46, 'throughput': 398.54}
[INFO|callbacks.py:310] 2024-09-19 18:31:08,310 >> {'loss': 0.0390, 'learning_rate': 1.2500e-05, 'epoch': 6.55, 'throughput': 398.74}
[INFO|<string>:478] 2024-09-19 18:33:16,716 >>

Training completed. Do not forget to share your model on huggingface.co/models =)

[INFO|callbacks.py:153] 2024-09-19 18:33:16,718 >> Converted PiSSA adapter will be saved at: saves/LLaMA3-8B-Chat/lora/JudgePierce/pissa_converted.
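
Converting the trained PiSSA adapter back into a plain LoRA adapter uses the initial adapter saved at the start of the run as the reference point. A minimal sketch, assuming PEFT's weight-conversion argument on save_pretrained; both paths come from the log:

    # Sketch: save the trained adapter while converting PiSSA weights to plain LoRA.
    model.save_pretrained(
        "saves/LLaMA3-8B-Chat/lora/JudgePierce/pissa_converted",
        path_initial_model_for_weight_conversion="saves/LLaMA3-8B-Chat/lora/JudgePierce/pissa_init",
    )
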
[INFO|trainer.py:3503] 2024-09-19 18:33:18,563 >> Saving model checkpoint to saves/LLaMA3-8B-Chat/lora/JudgePierce
[WARNING|ploting.py:89] 2024-09-19 18:33:18,991 >> No metric eval_loss to plot.
[WARNING|ploting.py:89] 2024-09-19 18:33:18,991 >> No metric eval_accuracy to plot.
[INFO|modelcard.py:449] 2024-09-19 18:33:18,992 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
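
The eval_loss/eval_accuracy warnings just mean no evaluation set was configured for this run, so only training loss was plotted. Once saved, the adapter directory can be applied on top of the base model for inference. A minimal sketch, assuming the Hub model id and the save directory from the log:

    # Sketch: load the fine-tuned adapter onto the base model for inference.
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM

    base = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3.1-8B-Instruct", torch_dtype=torch.bfloat16
    )
    model = PeftModel.from_pretrained(base, "saves/LLaMA3-8B-Chat/lora/JudgePierce")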