update model checkpoints

Browse files

Files changed (11) hide show

README.md +19 -22
adapter_config.json +4 -9
adapter_model.safetensors +2 -2
all_results.json +11 -11
config.json +3 -3
eval_results.json +5 -5
runs/Jun13_06-11-24_gpu1-2/events.out.tfevents.1718230372.gpu1-2.1135362.0 +3 -0
runs/Jun13_06-11-24_gpu1-2/events.out.tfevents.1718231704.gpu1-2.1135362.1 +3 -0
train_results.json +7 -7
trainer_state.json +533 -2605
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -1,14 +1,11 @@
 ---
-license: gemma
 library_name: peft
 tags:
 - alignment-handbook
-- trl
-- sft
 - generated_from_trainer
-base_model: google/gemma-7b
 datasets:
 - llama-duo/synth_summarize_dataset_dedup
 model-index:
 - name: gemma7b-summarize-gpt4o-32k
   results: []
@@ -21,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
 This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset_dedup dataset.
 It achieves the following results on the evaluation set:
-- Loss: 4.7170
 ## Model description
@@ -42,13 +39,13 @@ More information needed
 The following hyperparameters were used during training:
 - learning_rate: 0.0002
 - train_batch_size: 4
-- eval_batch_size: 4
 - seed: 42
 - distributed_type: multi-GPU
-- num_devices: 2
 - gradient_accumulation_steps: 2
-- total_train_batch_size: 16
-- total_eval_batch_size: 8
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
@@ -56,24 +53,24 @@ The following hyperparameters were used during training:
 ### Training results
-| Training Loss | Epoch  | Step | Validation Loss |
-|:-------------:|:------:|:----:|:---------------:|
-| 1.0031        | 0.9975 | 203  | 2.5299          |
-| 0.8685        | 2.0    | 407  | 2.4260          |
-| 0.8           | 2.9975 | 610  | 2.5051          |
-| 0.6938        | 4.0    | 814  | 2.6558          |
-| 0.5865        | 4.9975 | 1017 | 2.9163          |
-| 0.4821        | 6.0    | 1221 | 3.3285          |
-| 0.3899        | 6.9975 | 1424 | 3.8984          |
-| 0.3321        | 8.0    | 1628 | 4.4348          |
-| 0.3089        | 8.9975 | 1831 | 4.6891          |
-| 0.3016        | 9.9754 | 2030 | 4.7170          |
 ### Framework versions
 - PEFT 0.10.0
 - Transformers 4.40.0
-- Pytorch 2.2.1+cu121
 - Datasets 2.18.0
 - Tokenizers 0.19.1

 ---
 library_name: peft
 tags:
 - alignment-handbook
 - generated_from_trainer
 datasets:
 - llama-duo/synth_summarize_dataset_dedup
+base_model: google/gemma-7b
 model-index:
 - name: gemma7b-summarize-gpt4o-32k
   results: []
 This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the llama-duo/synth_summarize_dataset_dedup dataset.
 It achieves the following results on the evaluation set:
+- Loss: 2.5602
 ## Model description
 The following hyperparameters were used during training:
 - learning_rate: 0.0002
 - train_batch_size: 4
+- eval_batch_size: 2
 - seed: 42
 - distributed_type: multi-GPU
+- num_devices: 8
 - gradient_accumulation_steps: 2
+- total_train_batch_size: 64
+- total_eval_batch_size: 16
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
 ### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 3.5798        | 1.0   | 55   | 4.0590          |
+| 1.346         | 2.0   | 110  | 2.6944          |
+| 1.1944        | 3.0   | 165  | 2.6027          |
+| 1.1119        | 4.0   | 220  | 2.5779          |
+| 1.0741        | 5.0   | 275  | 2.5495          |
+| 1.0435        | 6.0   | 330  | 2.5501          |
+| 1.0191        | 7.0   | 385  | 2.5536          |
+| 0.9965        | 8.0   | 440  | 2.5604          |
+| 0.9986        | 9.0   | 495  | 2.5597          |
+| 0.9948        | 10.0  | 550  | 2.5602          |
 ### Framework versions
 - PEFT 0.10.0
 - Transformers 4.40.0
+- Pytorch 2.1.2+cu121
 - Datasets 2.18.0
 - Tokenizers 0.19.1

adapter_config.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "google/gemma-7b",
   "bias": "none",
   "fan_in_fan_out": false,
   "inference_mode": true,
@@ -10,23 +10,18 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
-  "lora_alpha": 16,
   "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
-  "r": 8,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "v_proj",
-    "k_proj",
-    "o_proj",
-    "down_proj",
-    "up_proj",
-    "q_proj",
-    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

 {
   "alpha_pattern": {},
   "auto_mapping": null,
+  "base_model_name_or_path": "/hpc2hdd/home/fwang380/OpenSource/Models/gemma-7b",
   "bias": "none",
   "fan_in_fan_out": false,
   "inference_mode": true,
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
+  "lora_alpha": 32,
   "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "peft_type": "LORA",
+  "r": 16,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "v_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:125f331ca9a8684aec05b7fd7ab37fb0f3ec43d64414a7c4aa9dfc598b180031
-size 50056096

 version https://git-lfs.github.com/spec/v1
+oid sha256:6bf418905f7800a9c225c813c0bcb18b5fabd4cf79b2b3fb4b806ccb3e38eb8f
+size 12860096

all_results.json CHANGED Viewed

@@ -1,14 +1,14 @@
 {
-    "epoch": 9.975429975429975,
-    "eval_loss": 4.717045307159424,
-    "eval_runtime": 2.0634,
     "eval_samples": 25,
-    "eval_samples_per_second": 4.846,
-    "eval_steps_per_second": 0.969,
-    "total_flos": 1.5518062706111283e+18,
-    "train_loss": 1.3545548074938394,
-    "train_runtime": 13205.4864,
-    "train_samples": 29997,
-    "train_samples_per_second": 2.465,
-    "train_steps_per_second": 0.154
 }

 {
+    "epoch": 10.0,
+    "eval_loss": 2.5602283477783203,
+    "eval_runtime": 0.2344,
     "eval_samples": 25,
+    "eval_samples_per_second": 42.666,
+    "eval_steps_per_second": 4.267,
+    "total_flos": 1.6777423328808796e+18,
+    "train_loss": 3.202145513187755,
+    "train_runtime": 1331.6624,
+    "train_samples": 32305,
+    "train_samples_per_second": 26.313,
+    "train_steps_per_second": 0.413
 }

config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "google/gemma-7b",
   "architectures": [
     "GemmaForCausalLM"
   ],
@@ -23,9 +23,9 @@
     "_load_in_4bit": true,
     "_load_in_8bit": false,
     "bnb_4bit_compute_dtype": "bfloat16",
-    "bnb_4bit_quant_storage": "bfloat16",
     "bnb_4bit_quant_type": "nf4",
-    "bnb_4bit_use_double_quant": true,
     "llm_int8_enable_fp32_cpu_offload": false,
     "llm_int8_has_fp16_weight": false,
     "llm_int8_skip_modules": null,

 {
+  "_name_or_path": "/hpc2hdd/home/fwang380/OpenSource/Models/gemma-7b",
   "architectures": [
     "GemmaForCausalLM"
   ],
     "_load_in_4bit": true,
     "_load_in_8bit": false,
     "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
     "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": false,
     "llm_int8_enable_fp32_cpu_offload": false,
     "llm_int8_has_fp16_weight": false,
     "llm_int8_skip_modules": null,

eval_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
-    "epoch": 9.975429975429975,
-    "eval_loss": 4.717045307159424,
-    "eval_runtime": 2.0634,
     "eval_samples": 25,
-    "eval_samples_per_second": 4.846,
-    "eval_steps_per_second": 0.969
 }

 {
+    "epoch": 10.0,
+    "eval_loss": 2.5602283477783203,
+    "eval_runtime": 0.2344,
     "eval_samples": 25,
+    "eval_samples_per_second": 42.666,
+    "eval_steps_per_second": 4.267
 }

runs/Jun13_06-11-24_gpu1-2/events.out.tfevents.1718230372.gpu1-2.1135362.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d67de193adcb4d80b1f0b1ae3a9884c43fa718e5a324b60954b7ef2e445aa18
+size 31822

runs/Jun13_06-11-24_gpu1-2/events.out.tfevents.1718231704.gpu1-2.1135362.1 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:54628faf2b0a0a9c2d9de830029cec936a75a4545f08d51e8c5d9000dee67dbc
+size 359

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 9.975429975429975,
-    "total_flos": 1.5518062706111283e+18,
-    "train_loss": 1.3545548074938394,
-    "train_runtime": 13205.4864,
-    "train_samples": 29997,
-    "train_samples_per_second": 2.465,
-    "train_steps_per_second": 0.154
 }

 {
+    "epoch": 10.0,
+    "total_flos": 1.6777423328808796e+18,
+    "train_loss": 3.202145513187755,
+    "train_runtime": 1331.6624,
+    "train_samples": 32305,
+    "train_samples_per_second": 26.313,
+    "train_steps_per_second": 0.413
 }

trainer_state.json CHANGED Viewed

@@ -1,2958 +1,886 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 9.975429975429975,
   "eval_steps": 500,
-  "global_step": 2030,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.004914004914004914,
-      "grad_norm": 552.0,
-      "learning_rate": 9.852216748768474e-07,
-      "loss": 51.7129,
       "step": 1
     },
     {
-      "epoch": 0.02457002457002457,
-      "grad_norm": 404.0,
-      "learning_rate": 4.926108374384237e-06,
-      "loss": 45.7069,
       "step": 5
     },
     {
-      "epoch": 0.04914004914004914,
-      "grad_norm": 388.0,
-      "learning_rate": 9.852216748768475e-06,
-      "loss": 46.9702,
       "step": 10
     },
     {
-      "epoch": 0.07371007371007371,
-      "grad_norm": 292.0,
-      "learning_rate": 1.4778325123152711e-05,
-      "loss": 37.466,
       "step": 15
     },
     {
-      "epoch": 0.09828009828009827,
-      "grad_norm": 79.5,
-      "learning_rate": 1.970443349753695e-05,
-      "loss": 28.4521,
       "step": 20
     },
     {
-      "epoch": 0.12285012285012285,
-      "grad_norm": 47.25,
-      "learning_rate": 2.4630541871921184e-05,
-      "loss": 23.8617,
       "step": 25
     },
     {
-      "epoch": 0.14742014742014742,
-      "grad_norm": 26.625,
-      "learning_rate": 2.9556650246305422e-05,
-      "loss": 22.4994,
       "step": 30
     },
     {
-      "epoch": 0.171990171990172,
-      "grad_norm": 12.625,
-      "learning_rate": 3.4482758620689657e-05,
-      "loss": 20.8855,
       "step": 35
     },
     {
-      "epoch": 0.19656019656019655,
-      "grad_norm": 7.84375,
-      "learning_rate": 3.94088669950739e-05,
-      "loss": 19.2048,
       "step": 40
     },
     {
-      "epoch": 0.22113022113022113,
-      "grad_norm": 8.5,
-      "learning_rate": 4.433497536945813e-05,
-      "loss": 18.9068,
       "step": 45
     },
     {
-      "epoch": 0.2457002457002457,
-      "grad_norm": 12.625,
-      "learning_rate": 4.926108374384237e-05,
-      "loss": 17.7765,
       "step": 50
     },
     {
-      "epoch": 0.2702702702702703,
-      "grad_norm": 24.875,
-      "learning_rate": 5.41871921182266e-05,
-      "loss": 16.5829,
       "step": 55
     },
     {
-      "epoch": 0.29484029484029484,
-      "grad_norm": 76.5,
-      "learning_rate": 5.9113300492610844e-05,
-      "loss": 13.7055,
       "step": 60
     },
     {
-      "epoch": 0.3194103194103194,
-      "grad_norm": 52.25,
-      "learning_rate": 6.403940886699507e-05,
-      "loss": 7.5296,
       "step": 65
     },
     {
-      "epoch": 0.343980343980344,
-      "grad_norm": 11.9375,
-      "learning_rate": 6.896551724137931e-05,
-      "loss": 2.7412,
       "step": 70
     },
     {
-      "epoch": 0.36855036855036855,
-      "grad_norm": 2.984375,
-      "learning_rate": 7.389162561576355e-05,
-      "loss": 2.1519,
       "step": 75
     },
     {
-      "epoch": 0.3931203931203931,
-      "grad_norm": 2.625,
-      "learning_rate": 7.88177339901478e-05,
-      "loss": 1.849,
       "step": 80
     },
     {
-      "epoch": 0.4176904176904177,
-      "grad_norm": 3.21875,
-      "learning_rate": 8.374384236453202e-05,
-      "loss": 1.6847,
       "step": 85
     },
     {
-      "epoch": 0.44226044226044225,
-      "grad_norm": 2.5625,
-      "learning_rate": 8.866995073891627e-05,
-      "loss": 1.528,
       "step": 90
     },
     {
-      "epoch": 0.4668304668304668,
-      "grad_norm": 5.65625,
-      "learning_rate": 9.35960591133005e-05,
-      "loss": 1.4458,
       "step": 95
     },
     {
-      "epoch": 0.4914004914004914,
-      "grad_norm": 1.96875,
-      "learning_rate": 9.852216748768474e-05,
-      "loss": 1.3912,
       "step": 100
     },
     {
-      "epoch": 0.515970515970516,
-      "grad_norm": 4.34375,
-      "learning_rate": 0.00010344827586206898,
-      "loss": 1.3435,
       "step": 105
     },
     {
-      "epoch": 0.5405405405405406,
-      "grad_norm": 1.484375,
-      "learning_rate": 0.0001083743842364532,
-      "loss": 1.2916,
       "step": 110
     },
     {
-      "epoch": 0.5651105651105651,
-      "grad_norm": 2.828125,
-      "learning_rate": 0.00011330049261083743,
-      "loss": 1.268,
       "step": 115
     },
     {
-      "epoch": 0.5896805896805897,
-      "grad_norm": 8.75,
-      "learning_rate": 0.00011822660098522169,
-      "loss": 1.2419,
       "step": 120
     },
     {
-      "epoch": 0.6142506142506142,
-      "grad_norm": 9.875,
-      "learning_rate": 0.00012315270935960593,
-      "loss": 1.1848,
       "step": 125
     },
     {
-      "epoch": 0.6388206388206388,
-      "grad_norm": 2.859375,
-      "learning_rate": 0.00012807881773399014,
-      "loss": 1.1821,
       "step": 130
     },
     {
-      "epoch": 0.6633906633906634,
-      "grad_norm": 2.203125,
-      "learning_rate": 0.00013300492610837438,
-      "loss": 1.1308,
       "step": 135
     },
     {
-      "epoch": 0.687960687960688,
-      "grad_norm": 3.453125,
-      "learning_rate": 0.00013793103448275863,
-      "loss": 1.1049,
       "step": 140
     },
     {
-      "epoch": 0.7125307125307125,
-      "grad_norm": 5.53125,
-      "learning_rate": 0.00014285714285714287,
-      "loss": 1.0904,
       "step": 145
     },
     {
-      "epoch": 0.7371007371007371,
-      "grad_norm": 36.75,
-      "learning_rate": 0.0001477832512315271,
-      "loss": 1.0884,
       "step": 150
     },
     {
-      "epoch": 0.7616707616707616,
-      "grad_norm": 2.1875,
-      "learning_rate": 0.00015270935960591132,
-      "loss": 1.0498,
       "step": 155
     },
     {
-      "epoch": 0.7862407862407862,
-      "grad_norm": 3.734375,
-      "learning_rate": 0.0001576354679802956,
-      "loss": 1.0656,
       "step": 160
     },
     {
-      "epoch": 0.8108108108108109,
-      "grad_norm": 2.71875,
-      "learning_rate": 0.0001625615763546798,
-      "loss": 1.0436,
       "step": 165
     },
     {
-      "epoch": 0.8353808353808354,
-      "grad_norm": 3.03125,
-      "learning_rate": 0.00016748768472906405,
-      "loss": 1.0507,
       "step": 170
     },
     {
-      "epoch": 0.85995085995086,
-      "grad_norm": 2.109375,
-      "learning_rate": 0.00017241379310344826,
-      "loss": 1.0593,
       "step": 175
     },
     {
-      "epoch": 0.8845208845208845,
-      "grad_norm": 28.875,
-      "learning_rate": 0.00017733990147783253,
-      "loss": 1.0432,
       "step": 180
     },
     {
-      "epoch": 0.9090909090909091,
-      "grad_norm": 1.4609375,
-      "learning_rate": 0.00018226600985221675,
-      "loss": 1.0558,
       "step": 185
     },
     {
-      "epoch": 0.9336609336609336,
-      "grad_norm": 3.796875,
-      "learning_rate": 0.000187192118226601,
-      "loss": 1.0305,
       "step": 190
     },
     {
-      "epoch": 0.9582309582309583,
-      "grad_norm": 1.9140625,
-      "learning_rate": 0.00019211822660098523,
-      "loss": 1.0315,
       "step": 195
     },
     {
-      "epoch": 0.9828009828009828,
-      "grad_norm": 3.28125,
-      "learning_rate": 0.00019704433497536947,
-      "loss": 1.0031,
       "step": 200
     },
     {
-      "epoch": 0.9975429975429976,
-      "eval_loss": 2.529916286468506,
-      "eval_runtime": 2.0529,
-      "eval_samples_per_second": 4.871,
-      "eval_steps_per_second": 0.974,
-      "step": 203
-    },
-    {
-      "epoch": 1.0073710073710074,
-      "grad_norm": 9.4375,
-      "learning_rate": 0.00019999940863962815,
-      "loss": 1.0212,
       "step": 205
     },
     {
-      "epoch": 1.031941031941032,
-      "grad_norm": 1.421875,
-      "learning_rate": 0.00019999275591576766,
-      "loss": 0.9729,
       "step": 210
     },
     {
-      "epoch": 1.0565110565110565,
-      "grad_norm": 5.875,
-      "learning_rate": 0.00019997871176098827,
-      "loss": 0.9684,
       "step": 215
     },
     {
-      "epoch": 1.0810810810810811,
-      "grad_norm": 3.328125,
-      "learning_rate": 0.00019995727721342914,
-      "loss": 0.9533,
       "step": 220
     },
     {
-      "epoch": 1.1056511056511056,
-      "grad_norm": 1.1953125,
-      "learning_rate": 0.00019992845385752485,
-      "loss": 0.9418,
       "step": 225
     },
     {
-      "epoch": 1.1302211302211302,
-      "grad_norm": 2.3125,
-      "learning_rate": 0.00019989224382388813,
-      "loss": 0.9365,
       "step": 230
     },
     {
-      "epoch": 1.154791154791155,
-      "grad_norm": 27.75,
-      "learning_rate": 0.00019984864978915253,
-      "loss": 0.9535,
       "step": 235
     },
     {
-      "epoch": 1.1793611793611793,
-      "grad_norm": 3.03125,
-      "learning_rate": 0.00019979767497577445,
-      "loss": 0.9555,
       "step": 240
     },
     {
-      "epoch": 1.203931203931204,
-      "grad_norm": 3.34375,
-      "learning_rate": 0.000199739323151795,
-      "loss": 0.9599,
       "step": 245
     },
     {
-      "epoch": 1.2285012285012284,
-      "grad_norm": 1.8671875,
-      "learning_rate": 0.00019967359863056134,
-      "loss": 0.9268,
       "step": 250
     },
     {
-      "epoch": 1.253071253071253,
-      "grad_norm": 1.96875,
-      "learning_rate": 0.00019960050627040806,
-      "loss": 0.9183,
       "step": 255
     },
     {
-      "epoch": 1.2776412776412776,
-      "grad_norm": 8.125,
-      "learning_rate": 0.0001995200514742978,
-      "loss": 0.9041,
       "step": 260
     },
     {
-      "epoch": 1.3022113022113022,
-      "grad_norm": 0.8046875,
-      "learning_rate": 0.0001994322401894221,
-      "loss": 0.915,
       "step": 265
     },
     {
-      "epoch": 1.3267813267813269,
-      "grad_norm": 0.96484375,
-      "learning_rate": 0.00019933707890676158,
-      "loss": 0.9009,
       "step": 270
     },
     {
-      "epoch": 1.3513513513513513,
-      "grad_norm": 3.515625,
-      "learning_rate": 0.00019923457466060636,
-      "loss": 0.918,
       "step": 275
     },
     {
-      "epoch": 1.375921375921376,
-      "grad_norm": 1.1640625,
-      "learning_rate": 0.00019912473502803582,
-      "loss": 0.9329,
       "step": 280
     },
     {
-      "epoch": 1.4004914004914004,
-      "grad_norm": 0.859375,
-      "learning_rate": 0.0001990075681283587,
-      "loss": 0.9269,
       "step": 285
     },
     {
-      "epoch": 1.425061425061425,
-      "grad_norm": 4.875,
-      "learning_rate": 0.00019888308262251285,
-      "loss": 0.9245,
       "step": 290
     },
     {
-      "epoch": 1.4496314496314495,
-      "grad_norm": 1.3203125,
-      "learning_rate": 0.00019875128771242506,
-      "loss": 0.9252,
       "step": 295
     },
     {
-      "epoch": 1.4742014742014742,
-      "grad_norm": 2.203125,
-      "learning_rate": 0.00019861219314033077,
-      "loss": 0.9055,
       "step": 300
     },
     {
-      "epoch": 1.4987714987714988,
-      "grad_norm": 2.640625,
-      "learning_rate": 0.000198465809188054,
-      "loss": 0.9095,
       "step": 305
     },
     {
-      "epoch": 1.5233415233415233,
-      "grad_norm": 1.7421875,
-      "learning_rate": 0.0001983121466762474,
-      "loss": 0.9061,
       "step": 310
     },
     {
-      "epoch": 1.547911547911548,
-      "grad_norm": 3.671875,
-      "learning_rate": 0.00019815121696359212,
-      "loss": 0.8923,
       "step": 315
     },
     {
-      "epoch": 1.5724815724815726,
-      "grad_norm": 2.875,
-      "learning_rate": 0.00019798303194595846,
-      "loss": 0.9021,
       "step": 320
     },
     {
-      "epoch": 1.597051597051597,
-      "grad_norm": 1.3671875,
-      "learning_rate": 0.00019780760405552645,
-      "loss": 0.8921,
       "step": 325
     },
     {
-      "epoch": 1.6216216216216215,
-      "grad_norm": 1.09375,
-      "learning_rate": 0.00019762494625986677,
-      "loss": 0.8885,
       "step": 330
     },
     {
-      "epoch": 1.6461916461916462,
-      "grad_norm": 2.671875,
-      "learning_rate": 0.00019743507206098233,
-      "loss": 0.8895,
       "step": 335
     },
     {
-      "epoch": 1.6707616707616708,
-      "grad_norm": 0.7265625,
-      "learning_rate": 0.00019723799549431007,
-      "loss": 0.8976,
       "step": 340
     },
     {
-      "epoch": 1.6953316953316953,
-      "grad_norm": 1.109375,
-      "learning_rate": 0.00019703373112768365,
-      "loss": 0.8836,
       "step": 345
     },
     {
-      "epoch": 1.71990171990172,
-      "grad_norm": 1.0234375,
-      "learning_rate": 0.00019682229406025635,
-      "loss": 0.8801,
       "step": 350
     },
     {
-      "epoch": 1.7444717444717446,
-      "grad_norm": 1.6875,
-      "learning_rate": 0.00019660369992138517,
-      "loss": 0.9048,
       "step": 355
     },
     {
-      "epoch": 1.769041769041769,
-      "grad_norm": 1.4921875,
-      "learning_rate": 0.0001963779648694754,
-      "loss": 0.8906,
       "step": 360
     },
     {
-      "epoch": 1.7936117936117935,
-      "grad_norm": 1.3203125,
-      "learning_rate": 0.00019614510559078625,
-      "loss": 0.8805,
       "step": 365
     },
     {
-      "epoch": 1.8181818181818183,
-      "grad_norm": 0.80859375,
-      "learning_rate": 0.00019590513929819734,
-      "loss": 0.8839,
       "step": 370
     },
     {
-      "epoch": 1.8427518427518428,
-      "grad_norm": 1.2109375,
-      "learning_rate": 0.0001956580837299364,
-      "loss": 0.8715,
       "step": 375
     },
     {
-      "epoch": 1.8673218673218672,
-      "grad_norm": 1.0078125,
-      "learning_rate": 0.000195403957148268,
-      "loss": 0.8952,
       "step": 380
     },
     {
-      "epoch": 1.8918918918918919,
-      "grad_norm": 0.75390625,
-      "learning_rate": 0.0001951427783381437,
-      "loss": 0.8585,
       "step": 385
     },
     {
-      "epoch": 1.9164619164619165,
-      "grad_norm": 0.953125,
-      "learning_rate": 0.0001948745666058134,
-      "loss": 0.8868,
       "step": 390
     },
     {
-      "epoch": 1.941031941031941,
-      "grad_norm": 2.734375,
-      "learning_rate": 0.00019459934177739813,
-      "loss": 0.8778,
       "step": 395
     },
     {
-      "epoch": 1.9656019656019657,
-      "grad_norm": 1.8359375,
-      "learning_rate": 0.00019431712419742484,
-      "loss": 0.8902,
       "step": 400
     },
     {
-      "epoch": 1.9901719901719903,
-      "grad_norm": 3.078125,
-      "learning_rate": 0.00019402793472732217,
-      "loss": 0.8685,
       "step": 405
     },
     {
-      "epoch": 2.0,
-      "eval_loss": 2.4260478019714355,
-      "eval_runtime": 2.0431,
-      "eval_samples_per_second": 4.895,
-      "eval_steps_per_second": 0.979,
-      "step": 407
-    },
-    {
-      "epoch": 2.0147420147420148,
-      "grad_norm": 1.359375,
-      "learning_rate": 0.00019373179474387858,
-      "loss": 0.8187,
       "step": 410
     },
     {
-      "epoch": 2.039312039312039,
-      "grad_norm": 0.9140625,
-      "learning_rate": 0.0001934287261376622,
-      "loss": 0.7808,
       "step": 415
     },
     {
-      "epoch": 2.063882063882064,
-      "grad_norm": 0.73828125,
-      "learning_rate": 0.00019311875131140246,
-      "loss": 0.7746,
       "step": 420
     },
     {
-      "epoch": 2.0884520884520885,
-      "grad_norm": 0.71875,
-      "learning_rate": 0.00019280189317833445,
-      "loss": 0.7761,
       "step": 425
     },
     {
-      "epoch": 2.113022113022113,
-      "grad_norm": 1.5859375,
-      "learning_rate": 0.00019247817516050483,
-      "loss": 0.7781,
       "step": 430
     },
     {
-      "epoch": 2.1375921375921374,
-      "grad_norm": 0.93359375,
-      "learning_rate": 0.00019214762118704076,
-      "loss": 0.7648,
       "step": 435
     },
     {
-      "epoch": 2.1621621621621623,
-      "grad_norm": 0.828125,
-      "learning_rate": 0.0001918102556923809,
-      "loss": 0.7926,
       "step": 440
     },
     {
-      "epoch": 2.1867321867321867,
-      "grad_norm": 2.0625,
-      "learning_rate": 0.0001914661036144692,
-      "loss": 0.7821,
       "step": 445
     },
     {
-      "epoch": 2.211302211302211,
-      "grad_norm": 1.0234375,
-      "learning_rate": 0.00019111519039291167,
-      "loss": 0.7807,
       "step": 450
     },
     {
-      "epoch": 2.235872235872236,
-      "grad_norm": 1.484375,
-      "learning_rate": 0.00019075754196709572,
-      "loss": 0.7994,
       "step": 455
     },
     {
-      "epoch": 2.2604422604422605,
-      "grad_norm": 1.0625,
-      "learning_rate": 0.0001903931847742728,
-      "loss": 0.7777,
       "step": 460
     },
     {
-      "epoch": 2.285012285012285,
-      "grad_norm": 1.8203125,
-      "learning_rate": 0.00019002214574760423,
-      "loss": 0.7781,
       "step": 465
     },
     {
-      "epoch": 2.30958230958231,
-      "grad_norm": 1.484375,
-      "learning_rate": 0.0001896444523141701,
-      "loss": 0.7977,
       "step": 470
     },
     {
-      "epoch": 2.3341523341523343,
-      "grad_norm": 0.73046875,
-      "learning_rate": 0.00018926013239294216,
-      "loss": 0.7758,
       "step": 475
     },
     {
-      "epoch": 2.3587223587223587,
-      "grad_norm": 1.0234375,
-      "learning_rate": 0.00018886921439271984,
-      "loss": 0.8019,
       "step": 480
     },
     {
-      "epoch": 2.383292383292383,
-      "grad_norm": 0.91015625,
-      "learning_rate": 0.00018847172721003043,
-      "loss": 0.7829,
       "step": 485
     },
     {
-      "epoch": 2.407862407862408,
-      "grad_norm": 0.984375,
-      "learning_rate": 0.00018806770022699278,
-      "loss": 0.7759,
       "step": 490
     },
     {
-      "epoch": 2.4324324324324325,
-      "grad_norm": 0.8828125,
-      "learning_rate": 0.0001876571633091458,
-      "loss": 0.776,
       "step": 495
     },
     {
-      "epoch": 2.457002457002457,
-      "grad_norm": 1.1875,
-      "learning_rate": 0.00018724014680324057,
-      "loss": 0.7885,
       "step": 500
     },
     {
-      "epoch": 2.4815724815724813,
-      "grad_norm": 0.9375,
-      "learning_rate": 0.00018681668153499697,
-      "loss": 0.7929,
       "step": 505
     },
     {
-      "epoch": 2.506142506142506,
-      "grad_norm": 4.25,
-      "learning_rate": 0.00018638679880682543,
-      "loss": 0.7835,
       "step": 510
     },
     {
-      "epoch": 2.5307125307125307,
-      "grad_norm": 1.109375,
-      "learning_rate": 0.00018595053039551274,
-      "loss": 0.7981,
       "step": 515
     },
     {
-      "epoch": 2.555282555282555,
-      "grad_norm": 0.80078125,
-      "learning_rate": 0.00018550790854987323,
-      "loss": 0.7831,
       "step": 520
     },
     {
-      "epoch": 2.57985257985258,
-      "grad_norm": 0.92578125,
-      "learning_rate": 0.00018505896598836508,
-      "loss": 0.7863,
       "step": 525
     },
     {
-      "epoch": 2.6044226044226044,
-      "grad_norm": 0.84375,
-      "learning_rate": 0.00018460373589667154,
-      "loss": 0.7929,
       "step": 530
     },
     {
-      "epoch": 2.628992628992629,
-      "grad_norm": 1.1015625,
-      "learning_rate": 0.00018414225192524806,
-      "loss": 0.7722,
       "step": 535
     },
     {
-      "epoch": 2.6535626535626538,
-      "grad_norm": 1.390625,
-      "learning_rate": 0.00018367454818683473,
-      "loss": 0.7832,
       "step": 540
     },
     {
-      "epoch": 2.678132678132678,
-      "grad_norm": 1.2890625,
-      "learning_rate": 0.00018320065925393468,
-      "loss": 0.7746,
       "step": 545
     },
     {
-      "epoch": 2.7027027027027026,
-      "grad_norm": 1.59375,
-      "learning_rate": 0.00018272062015625872,
-      "loss": 0.7826,
       "step": 550
     },
     {
-      "epoch": 2.7272727272727275,
-      "grad_norm": 1.328125,
-      "learning_rate": 0.0001822344663781356,
-      "loss": 0.7946,
-      "step": 555
-    },
-    {
-      "epoch": 2.751842751842752,
-      "grad_norm": 1.0234375,
-      "learning_rate": 0.00018174223385588917,
-      "loss": 0.7849,
-      "step": 560
-    },
-    {
-      "epoch": 2.7764127764127764,
-      "grad_norm": 10.3125,
-      "learning_rate": 0.00018124395897518224,
-      "loss": 0.7783,
-      "step": 565
-    },
-    {
-      "epoch": 2.800982800982801,
-      "grad_norm": 0.8515625,
-      "learning_rate": 0.0001807396785683264,
-      "loss": 0.8005,
-      "step": 570
-    },
-    {
-      "epoch": 2.8255528255528253,
-      "grad_norm": 0.83984375,
-      "learning_rate": 0.00018022942991156,
-      "loss": 0.7981,
-      "step": 575
-    },
-    {
-      "epoch": 2.85012285012285,
-      "grad_norm": 1.015625,
-      "learning_rate": 0.00017971325072229226,
-      "loss": 0.7887,
-      "step": 580
-    },
-    {
-      "epoch": 2.8746928746928746,
-      "grad_norm": 0.8125,
-      "learning_rate": 0.0001791911791563154,
-      "loss": 0.8013,
-      "step": 585
-    },
-    {
-      "epoch": 2.899262899262899,
-      "grad_norm": 1.2578125,
-      "learning_rate": 0.00017866325380498416,
-      "loss": 0.777,
-      "step": 590
-    },
-    {
-      "epoch": 2.923832923832924,
-      "grad_norm": 1.03125,
-      "learning_rate": 0.00017812951369236316,
-      "loss": 0.7992,
-      "step": 595
-    },
-    {
-      "epoch": 2.9484029484029484,
-      "grad_norm": 1.640625,
-      "learning_rate": 0.00017758999827234212,
-      "loss": 0.7856,
-      "step": 600
-    },
-    {
-      "epoch": 2.972972972972973,
-      "grad_norm": 3.109375,
-      "learning_rate": 0.00017704474742571969,
-      "loss": 0.787,
-      "step": 605
-    },
-    {
-      "epoch": 2.9975429975429977,
-      "grad_norm": 3.8125,
-      "learning_rate": 0.00017649380145725517,
-      "loss": 0.8,
-      "step": 610
-    },
-    {
-      "epoch": 2.9975429975429977,
-      "eval_loss": 2.5051403045654297,
-      "eval_runtime": 2.0509,
-      "eval_samples_per_second": 4.876,
-      "eval_steps_per_second": 0.975,
-      "step": 610
-    },
-    {
-      "epoch": 3.022113022113022,
-      "grad_norm": 1.7890625,
-      "learning_rate": 0.00017593720109268944,
-      "loss": 0.6916,
-      "step": 615
-    },
-    {
-      "epoch": 3.0466830466830466,
-      "grad_norm": 9.4375,
-      "learning_rate": 0.00017537498747573443,
-      "loss": 0.6614,
-      "step": 620
-    },
-    {
-      "epoch": 3.0712530712530715,
-      "grad_norm": 0.96484375,
-      "learning_rate": 0.00017480720216503183,
-      "loss": 0.6639,
-      "step": 625
-    },
-    {
-      "epoch": 3.095823095823096,
-      "grad_norm": 0.74609375,
-      "learning_rate": 0.000174233887131081,
-      "loss": 0.6579,
-      "step": 630
-    },
-    {
-      "epoch": 3.1203931203931203,
-      "grad_norm": 1.015625,
-      "learning_rate": 0.0001736550847531366,
-      "loss": 0.6546,
-      "step": 635
-    },
-    {
-      "epoch": 3.1449631449631448,
-      "grad_norm": 0.8203125,
-      "learning_rate": 0.00017307083781607595,
-      "loss": 0.6731,
-      "step": 640
-    },
-    {
-      "epoch": 3.1695331695331697,
-      "grad_norm": 0.87890625,
-      "learning_rate": 0.00017248118950723634,
-      "loss": 0.6761,
-      "step": 645
-    },
-    {
-      "epoch": 3.194103194103194,
-      "grad_norm": 0.98828125,
-      "learning_rate": 0.00017188618341322254,
-      "loss": 0.6761,
-      "step": 650
-    },
-    {
-      "epoch": 3.2186732186732185,
-      "grad_norm": 1.7421875,
-      "learning_rate": 0.00017128586351668524,
-      "loss": 0.666,
-      "step": 655
-    },
-    {
-      "epoch": 3.2432432432432434,
-      "grad_norm": 1.3515625,
-      "learning_rate": 0.00017068027419306936,
-      "loss": 0.6677,
-      "step": 660
-    },
-    {
-      "epoch": 3.267813267813268,
-      "grad_norm": 0.9609375,
-      "learning_rate": 0.00017006946020733425,
-      "loss": 0.6663,
-      "step": 665
-    },
-    {
-      "epoch": 3.2923832923832923,
-      "grad_norm": 0.77734375,
-      "learning_rate": 0.00016945346671064452,
-      "loss": 0.6762,
-      "step": 670
-    },
-    {
-      "epoch": 3.3169533169533167,
-      "grad_norm": 0.7578125,
-      "learning_rate": 0.00016883233923703248,
-      "loss": 0.6842,
-      "step": 675
-    },
-    {
-      "epoch": 3.3415233415233416,
-      "grad_norm": 1.359375,
-      "learning_rate": 0.00016820612370003221,
-      "loss": 0.6756,
-      "step": 680
-    },
-    {
-      "epoch": 3.366093366093366,
-      "grad_norm": 1.015625,
-      "learning_rate": 0.00016757486638928587,
-      "loss": 0.6757,
-      "step": 685
-    },
-    {
-      "epoch": 3.3906633906633905,
-      "grad_norm": 1.5390625,
-      "learning_rate": 0.00016693861396712168,
-      "loss": 0.6971,
-      "step": 690
-    },
-    {
-      "epoch": 3.4152334152334154,
-      "grad_norm": 1.125,
-      "learning_rate": 0.00016629741346510496,
-      "loss": 0.6837,
-      "step": 695
-    },
-    {
-      "epoch": 3.43980343980344,
-      "grad_norm": 2.6875,
-      "learning_rate": 0.00016565131228056133,
-      "loss": 0.6836,
-      "step": 700
-    },
-    {
-      "epoch": 3.4643734643734643,
-      "grad_norm": 0.77734375,
-      "learning_rate": 0.00016500035817307334,
-      "loss": 0.6719,
-      "step": 705
-    },
-    {
-      "epoch": 3.488943488943489,
-      "grad_norm": 0.75,
-      "learning_rate": 0.0001643445992609498,
-      "loss": 0.6722,
-      "step": 710
-    },
-    {
-      "epoch": 3.5135135135135136,
-      "grad_norm": 0.82421875,
-      "learning_rate": 0.00016368408401766916,
-      "loss": 0.6843,
-      "step": 715
-    },
-    {
-      "epoch": 3.538083538083538,
-      "grad_norm": 0.85546875,
-      "learning_rate": 0.0001630188612682963,
-      "loss": 0.6787,
-      "step": 720
-    },
-    {
-      "epoch": 3.562653562653563,
-      "grad_norm": 0.71875,
-      "learning_rate": 0.00016234898018587337,
-      "loss": 0.6639,
-      "step": 725
-    },
-    {
-      "epoch": 3.5872235872235874,
-      "grad_norm": 0.96484375,
-      "learning_rate": 0.00016167449028778484,
-      "loss": 0.6951,
-      "step": 730
-    },
-    {
-      "epoch": 3.611793611793612,
-      "grad_norm": 0.80859375,
-      "learning_rate": 0.0001609954414320973,
-      "loss": 0.6881,
-      "step": 735
-    },
-    {
-      "epoch": 3.6363636363636362,
-      "grad_norm": 0.703125,
-      "learning_rate": 0.0001603118838138741,
-      "loss": 0.6761,
-      "step": 740
-    },
-    {
-      "epoch": 3.6609336609336607,
-      "grad_norm": 0.9609375,
-      "learning_rate": 0.00015962386796146462,
-      "loss": 0.69,
-      "step": 745
-    },
-    {
-      "epoch": 3.6855036855036856,
-      "grad_norm": 0.79296875,
-      "learning_rate": 0.00015893144473276953,
-      "loss": 0.691,
-      "step": 750
-    },
-    {
-      "epoch": 3.71007371007371,
-      "grad_norm": 1.03125,
-      "learning_rate": 0.00015823466531148124,
-      "loss": 0.6843,
-      "step": 755
-    },
-    {
-      "epoch": 3.7346437346437344,
-      "grad_norm": 0.765625,
-      "learning_rate": 0.00015753358120330042,
-      "loss": 0.7094,
-      "step": 760
-    },
-    {
-      "epoch": 3.7592137592137593,
-      "grad_norm": 1.0078125,
-      "learning_rate": 0.00015682824423212877,
-      "loss": 0.6892,
-      "step": 765
-    },
-    {
-      "epoch": 3.7837837837837838,
-      "grad_norm": 0.9296875,
-      "learning_rate": 0.00015611870653623825,
-      "loss": 0.6929,
-      "step": 770
-    },
-    {
-      "epoch": 3.808353808353808,
-      "grad_norm": 1.53125,
-      "learning_rate": 0.00015540502056441688,
-      "loss": 0.7022,
-      "step": 775
-    },
-    {
-      "epoch": 3.832923832923833,
-      "grad_norm": 0.91015625,
-      "learning_rate": 0.00015468723907209193,
-      "loss": 0.703,
-      "step": 780
-    },
-    {
-      "epoch": 3.8574938574938575,
-      "grad_norm": 0.80078125,
-      "learning_rate": 0.00015396541511743012,
-      "loss": 0.7027,
-      "step": 785
-    },
-    {
-      "epoch": 3.882063882063882,
-      "grad_norm": 0.98046875,
-      "learning_rate": 0.00015323960205741561,
-      "loss": 0.6829,
-      "step": 790
-    },
-    {
-      "epoch": 3.906633906633907,
-      "grad_norm": 0.921875,
-      "learning_rate": 0.00015250985354390596,
-      "loss": 0.6945,
-      "step": 795
-    },
-    {
-      "epoch": 3.9312039312039313,
-      "grad_norm": 0.98046875,
-      "learning_rate": 0.0001517762235196661,
-      "loss": 0.6966,
-      "step": 800
-    },
-    {
-      "epoch": 3.9557739557739557,
-      "grad_norm": 0.7890625,
-      "learning_rate": 0.00015103876621438086,
-      "loss": 0.6947,
-      "step": 805
-    },
-    {
-      "epoch": 3.98034398034398,
-      "grad_norm": 0.89453125,
-      "learning_rate": 0.00015029753614064645,
-      "loss": 0.6938,
-      "step": 810
-    },
-    {
-      "epoch": 4.0,
-      "eval_loss": 2.6558120250701904,
-      "eval_runtime": 2.0446,
-      "eval_samples_per_second": 4.891,
-      "eval_steps_per_second": 0.978,
-      "step": 814
-    },
-    {
-      "epoch": 4.004914004914005,
-      "grad_norm": 1.5703125,
-      "learning_rate": 0.00014955258808994096,
-      "loss": 0.6676,
-      "step": 815
-    },
-    {
-      "epoch": 4.0294840294840295,
-      "grad_norm": 0.92578125,
-      "learning_rate": 0.00014880397712857386,
-      "loss": 0.5659,
-      "step": 820
-    },
-    {
-      "epoch": 4.054054054054054,
-      "grad_norm": 0.7890625,
-      "learning_rate": 0.00014805175859361594,
-      "loss": 0.5525,
-      "step": 825
-    },
-    {
-      "epoch": 4.078624078624078,
-      "grad_norm": 1.328125,
-      "learning_rate": 0.00014729598808880861,
-      "loss": 0.5546,
-      "step": 830
-    },
-    {
-      "epoch": 4.103194103194103,
-      "grad_norm": 0.96875,
-      "learning_rate": 0.00014653672148045357,
-      "loss": 0.5665,
-      "step": 835
-    },
-    {
-      "epoch": 4.127764127764128,
-      "grad_norm": 1.0078125,
-      "learning_rate": 0.00014577401489328335,
-      "loss": 0.565,
-      "step": 840
-    },
-    {
-      "epoch": 4.152334152334152,
-      "grad_norm": 0.7734375,
-      "learning_rate": 0.0001450079247063127,
-      "loss": 0.5764,
-      "step": 845
-    },
-    {
-      "epoch": 4.176904176904177,
-      "grad_norm": 0.9375,
-      "learning_rate": 0.00014423850754867075,
-      "loss": 0.5565,
-      "step": 850
-    },
-    {
-      "epoch": 4.201474201474202,
-      "grad_norm": 0.921875,
-      "learning_rate": 0.0001434658202954153,
-      "loss": 0.5572,
-      "step": 855
-    },
-    {
-      "epoch": 4.226044226044226,
-      "grad_norm": 2.0,
-      "learning_rate": 0.00014268992006332846,
-      "loss": 0.5719,
-      "step": 860
-    },
-    {
-      "epoch": 4.250614250614251,
-      "grad_norm": 0.98828125,
-      "learning_rate": 0.0001419108642066947,
-      "loss": 0.5644,
-      "step": 865
-    },
-    {
-      "epoch": 4.275184275184275,
-      "grad_norm": 0.88671875,
-      "learning_rate": 0.00014112871031306119,
-      "loss": 0.5785,
-      "step": 870
-    },
-    {
-      "epoch": 4.2997542997543,
-      "grad_norm": 1.734375,
-      "learning_rate": 0.00014034351619898088,
-      "loss": 0.5825,
-      "step": 875
-    },
-    {
-      "epoch": 4.324324324324325,
-      "grad_norm": 1.1171875,
-      "learning_rate": 0.00013955533990573886,
-      "loss": 0.5752,
-      "step": 880
-    },
-    {
-      "epoch": 4.348894348894349,
-      "grad_norm": 1.1484375,
-      "learning_rate": 0.00013876423969506194,
-      "loss": 0.5863,
-      "step": 885
-    },
-    {
-      "epoch": 4.3734643734643734,
-      "grad_norm": 1.7421875,
-      "learning_rate": 0.00013797027404481184,
-      "loss": 0.5826,
-      "step": 890
-    },
-    {
-      "epoch": 4.398034398034398,
-      "grad_norm": 0.99609375,
-      "learning_rate": 0.0001371735016446627,
-      "loss": 0.576,
-      "step": 895
-    },
-    {
-      "epoch": 4.422604422604422,
-      "grad_norm": 0.875,
-      "learning_rate": 0.00013637398139176255,
-      "loss": 0.577,
-      "step": 900
-    },
-    {
-      "epoch": 4.447174447174447,
-      "grad_norm": 0.9375,
-      "learning_rate": 0.00013557177238637986,
-      "loss": 0.5832,
-      "step": 905
-    },
-    {
-      "epoch": 4.471744471744472,
-      "grad_norm": 0.9375,
-      "learning_rate": 0.00013476693392753476,
-      "loss": 0.5856,
-      "step": 910
-    },
-    {
-      "epoch": 4.496314496314496,
-      "grad_norm": 1.1328125,
-      "learning_rate": 0.00013395952550861572,
-      "loss": 0.592,
-      "step": 915
-    },
-    {
-      "epoch": 4.520884520884521,
-      "grad_norm": 0.97265625,
-      "learning_rate": 0.00013314960681298175,
-      "loss": 0.5861,
-      "step": 920
-    },
-    {
-      "epoch": 4.545454545454545,
-      "grad_norm": 0.97265625,
-      "learning_rate": 0.0001323372377095507,
-      "loss": 0.5814,
-      "step": 925
-    },
-    {
-      "epoch": 4.57002457002457,
-      "grad_norm": 0.91796875,
-      "learning_rate": 0.0001315224782483737,
-      "loss": 0.5847,
-      "step": 930
-    },
-    {
-      "epoch": 4.594594594594595,
-      "grad_norm": 0.8203125,
-      "learning_rate": 0.00013070538865619642,
-      "loss": 0.5773,
-      "step": 935
-    },
-    {
-      "epoch": 4.61916461916462,
-      "grad_norm": 0.859375,
-      "learning_rate": 0.00012988602933200689,
-      "loss": 0.5723,
-      "step": 940
-    },
-    {
-      "epoch": 4.643734643734644,
-      "grad_norm": 1.1171875,
-      "learning_rate": 0.0001290644608425711,
-      "loss": 0.5918,
-      "step": 945
-    },
-    {
-      "epoch": 4.6683046683046685,
-      "grad_norm": 0.90234375,
-      "learning_rate": 0.0001282407439179557,
-      "loss": 0.5932,
-      "step": 950
-    },
-    {
-      "epoch": 4.6928746928746925,
-      "grad_norm": 0.8671875,
-      "learning_rate": 0.00012741493944703905,
-      "loss": 0.5868,
-      "step": 955
-    },
-    {
-      "epoch": 4.717444717444717,
-      "grad_norm": 0.83203125,
-      "learning_rate": 0.0001265871084730101,
-      "loss": 0.5944,
-      "step": 960
-    },
-    {
-      "epoch": 4.742014742014742,
-      "grad_norm": 0.9765625,
-      "learning_rate": 0.00012575731218885625,
-      "loss": 0.5893,
-      "step": 965
-    },
-    {
-      "epoch": 4.766584766584766,
-      "grad_norm": 1.140625,
-      "learning_rate": 0.00012492561193284008,
-      "loss": 0.5791,
-      "step": 970
-    },
-    {
-      "epoch": 4.791154791154791,
-      "grad_norm": 1.1484375,
-      "learning_rate": 0.00012409206918396503,
-      "loss": 0.5876,
-      "step": 975
-    },
-    {
-      "epoch": 4.815724815724816,
-      "grad_norm": 0.7890625,
-      "learning_rate": 0.00012325674555743106,
-      "loss": 0.5871,
-      "step": 980
-    },
-    {
-      "epoch": 4.84029484029484,
-      "grad_norm": 0.8046875,
-      "learning_rate": 0.0001224197028000799,
-      "loss": 0.5987,
-      "step": 985
-    },
-    {
-      "epoch": 4.864864864864865,
-      "grad_norm": 0.92578125,
-      "learning_rate": 0.000121581002785831,
-      "loss": 0.5857,
-      "step": 990
-    },
-    {
-      "epoch": 4.88943488943489,
-      "grad_norm": 0.80078125,
-      "learning_rate": 0.00012074070751110751,
-      "loss": 0.5932,
-      "step": 995
-    },
-    {
-      "epoch": 4.914004914004914,
-      "grad_norm": 1.0625,
-      "learning_rate": 0.00011989887909025388,
-      "loss": 0.5891,
-      "step": 1000
-    },
-    {
-      "epoch": 4.938574938574939,
-      "grad_norm": 0.796875,
-      "learning_rate": 0.00011905557975094406,
-      "loss": 0.5881,
-      "step": 1005
-    },
-    {
-      "epoch": 4.963144963144963,
-      "grad_norm": 0.87890625,
-      "learning_rate": 0.00011821087182958186,
-      "loss": 0.5845,
-      "step": 1010
-    },
-    {
-      "epoch": 4.987714987714988,
-      "grad_norm": 0.828125,
-      "learning_rate": 0.00011736481776669306,
-      "loss": 0.5865,
-      "step": 1015
-    },
-    {
-      "epoch": 4.997542997542998,
-      "eval_loss": 2.9162657260894775,
-      "eval_runtime": 2.0535,
-      "eval_samples_per_second": 4.87,
-      "eval_steps_per_second": 0.974,
-      "step": 1017
-    },
-    {
-      "epoch": 5.012285012285012,
-      "grad_norm": 0.75390625,
-      "learning_rate": 0.0001165174801023096,
-      "loss": 0.5388,
-      "step": 1020
-    },
-    {
-      "epoch": 5.036855036855036,
-      "grad_norm": 0.953125,
-      "learning_rate": 0.00011566892147134705,
-      "loss": 0.4596,
-      "step": 1025
-    },
-    {
-      "epoch": 5.061425061425061,
-      "grad_norm": 0.7890625,
-      "learning_rate": 0.00011481920459897417,
-      "loss": 0.4681,
-      "step": 1030
-    },
-    {
-      "epoch": 5.085995085995086,
-      "grad_norm": 0.8828125,
-      "learning_rate": 0.00011396839229597674,
-      "loss": 0.4716,
-      "step": 1035
-    },
-    {
-      "epoch": 5.11056511056511,
-      "grad_norm": 0.80078125,
-      "learning_rate": 0.00011311654745411425,
-      "loss": 0.4667,
-      "step": 1040
-    },
-    {
-      "epoch": 5.135135135135135,
-      "grad_norm": 0.90625,
-      "learning_rate": 0.00011226373304147123,
-      "loss": 0.465,
-      "step": 1045
-    },
-    {
-      "epoch": 5.15970515970516,
-      "grad_norm": 0.828125,
-      "learning_rate": 0.00011141001209780249,
-      "loss": 0.4751,
-      "step": 1050
-    },
-    {
-      "epoch": 5.184275184275184,
-      "grad_norm": 1.0390625,
-      "learning_rate": 0.00011055544772987335,
-      "loss": 0.4622,
-      "step": 1055
-    },
-    {
-      "epoch": 5.208845208845209,
-      "grad_norm": 1.234375,
-      "learning_rate": 0.0001097001031067947,
-      "loss": 0.4839,
-      "step": 1060
-    },
-    {
-      "epoch": 5.233415233415234,
-      "grad_norm": 0.91015625,
-      "learning_rate": 0.00010884404145535372,
-      "loss": 0.4679,
-      "step": 1065
-    },
-    {
-      "epoch": 5.257985257985258,
-      "grad_norm": 0.88671875,
-      "learning_rate": 0.00010798732605534006,
-      "loss": 0.4764,
-      "step": 1070
-    },
-    {
-      "epoch": 5.282555282555283,
-      "grad_norm": 0.90234375,
-      "learning_rate": 0.00010713002023486816,
-      "loss": 0.4834,
-      "step": 1075
-    },
-    {
-      "epoch": 5.3071253071253075,
-      "grad_norm": 0.98046875,
-      "learning_rate": 0.00010627218736569624,
-      "loss": 0.4853,
-      "step": 1080
-    },
-    {
-      "epoch": 5.3316953316953315,
-      "grad_norm": 0.875,
-      "learning_rate": 0.00010541389085854176,
-      "loss": 0.4823,
-      "step": 1085
-    },
-    {
-      "epoch": 5.356265356265356,
-      "grad_norm": 1.03125,
-      "learning_rate": 0.00010455519415839415,
-      "loss": 0.4838,
-      "step": 1090
-    },
-    {
-      "epoch": 5.38083538083538,
-      "grad_norm": 0.92578125,
-      "learning_rate": 0.00010369616073982491,
-      "loss": 0.4881,
-      "step": 1095
-    },
-    {
-      "epoch": 5.405405405405405,
-      "grad_norm": 0.88671875,
-      "learning_rate": 0.00010283685410229571,
-      "loss": 0.485,
-      "step": 1100
-    },
-    {
-      "epoch": 5.42997542997543,
-      "grad_norm": 1.0390625,
-      "learning_rate": 0.00010197733776546447,
-      "loss": 0.4854,
-      "step": 1105
-    },
-    {
-      "epoch": 5.454545454545454,
-      "grad_norm": 0.9609375,
-      "learning_rate": 0.00010111767526449004,
-      "loss": 0.4851,
-      "step": 1110
-    },
-    {
-      "epoch": 5.479115479115479,
-      "grad_norm": 1.265625,
-      "learning_rate": 0.00010025793014533558,
-      "loss": 0.4889,
-      "step": 1115
-    },
-    {
-      "epoch": 5.503685503685504,
-      "grad_norm": 0.87890625,
-      "learning_rate": 9.939816596007146e-05,
-      "loss": 0.4772,
-      "step": 1120
-    },
-    {
-      "epoch": 5.528255528255528,
-      "grad_norm": 1.1640625,
-      "learning_rate": 9.853844626217737e-05,
-      "loss": 0.4857,
-      "step": 1125
-    },
-    {
-      "epoch": 5.552825552825553,
-      "grad_norm": 1.0703125,
-      "learning_rate": 9.767883460184443e-05,
-      "loss": 0.4869,
-      "step": 1130
-    },
-    {
-      "epoch": 5.577395577395578,
-      "grad_norm": 0.88671875,
-      "learning_rate": 9.681939452127784e-05,
-      "loss": 0.4806,
-      "step": 1135
-    },
-    {
-      "epoch": 5.601965601965602,
-      "grad_norm": 1.2421875,
-      "learning_rate": 9.596018954999953e-05,
-      "loss": 0.4897,
-      "step": 1140
-    },
-    {
-      "epoch": 5.6265356265356266,
-      "grad_norm": 1.015625,
-      "learning_rate": 9.510128320015224e-05,
-      "loss": 0.4749,
-      "step": 1145
-    },
-    {
-      "epoch": 5.651105651105651,
-      "grad_norm": 0.93359375,
-      "learning_rate": 9.424273896180482e-05,
-      "loss": 0.4837,
-      "step": 1150
-    },
-    {
-      "epoch": 5.675675675675675,
-      "grad_norm": 0.8984375,
-      "learning_rate": 9.338462029825886e-05,
-      "loss": 0.4928,
-      "step": 1155
-    },
-    {
-      "epoch": 5.7002457002457,
-      "grad_norm": 0.97265625,
-      "learning_rate": 9.252699064135758e-05,
-      "loss": 0.477,
-      "step": 1160
-    },
-    {
-      "epoch": 5.724815724815725,
-      "grad_norm": 0.87890625,
-      "learning_rate": 9.166991338679715e-05,
-      "loss": 0.4882,
-      "step": 1165
-    },
-    {
-      "epoch": 5.749385749385749,
-      "grad_norm": 1.078125,
-      "learning_rate": 9.081345188944019e-05,
-      "loss": 0.4902,
-      "step": 1170
-    },
-    {
-      "epoch": 5.773955773955774,
-      "grad_norm": 0.97265625,
-      "learning_rate": 8.995766945863277e-05,
-      "loss": 0.4873,
-      "step": 1175
-    },
-    {
-      "epoch": 5.798525798525798,
-      "grad_norm": 1.0625,
-      "learning_rate": 8.91026293535247e-05,
-      "loss": 0.4864,
-      "step": 1180
-    },
-    {
-      "epoch": 5.823095823095823,
-      "grad_norm": 1.1953125,
-      "learning_rate": 8.82483947783932e-05,
-      "loss": 0.4782,
-      "step": 1185
-    },
-    {
-      "epoch": 5.847665847665848,
-      "grad_norm": 0.921875,
-      "learning_rate": 8.739502887797107e-05,
-      "loss": 0.4918,
-      "step": 1190
-    },
-    {
-      "epoch": 5.872235872235873,
-      "grad_norm": 0.9296875,
-      "learning_rate": 8.654259473277892e-05,
-      "loss": 0.4955,
-      "step": 1195
-    },
-    {
-      "epoch": 5.896805896805897,
-      "grad_norm": 1.015625,
-      "learning_rate": 8.569115535446228e-05,
-      "loss": 0.4802,
-      "step": 1200
-    },
-    {
-      "epoch": 5.921375921375922,
-      "grad_norm": 0.8984375,
-      "learning_rate": 8.484077368113399e-05,
-      "loss": 0.4832,
-      "step": 1205
-    },
-    {
-      "epoch": 5.945945945945946,
-      "grad_norm": 0.87890625,
-      "learning_rate": 8.399151257272156e-05,
-      "loss": 0.4847,
-      "step": 1210
-    },
-    {
-      "epoch": 5.9705159705159705,
-      "grad_norm": 0.87890625,
-      "learning_rate": 8.314343480632078e-05,
-      "loss": 0.48,
-      "step": 1215
-    },
-    {
-      "epoch": 5.995085995085995,
-      "grad_norm": 0.87890625,
-      "learning_rate": 8.229660307155518e-05,
-      "loss": 0.4821,
-      "step": 1220
-    },
-    {
-      "epoch": 6.0,
-      "eval_loss": 3.3285133838653564,
-      "eval_runtime": 2.0506,
-      "eval_samples_per_second": 4.877,
-      "eval_steps_per_second": 0.975,
-      "step": 1221
-    },
-    {
-      "epoch": 6.019656019656019,
-      "grad_norm": 0.80078125,
-      "learning_rate": 8.145107996594206e-05,
-      "loss": 0.4087,
-      "step": 1225
-    },
-    {
-      "epoch": 6.044226044226044,
-      "grad_norm": 0.98046875,
-      "learning_rate": 8.060692799026522e-05,
-      "loss": 0.3843,
-      "step": 1230
-    },
-    {
-      "epoch": 6.068796068796069,
-      "grad_norm": 0.828125,
-      "learning_rate": 7.976420954395518e-05,
-      "loss": 0.3844,
-      "step": 1235
-    },
-    {
-      "epoch": 6.093366093366093,
-      "grad_norm": 0.984375,
-      "learning_rate": 7.892298692047621e-05,
-      "loss": 0.3909,
-      "step": 1240
-    },
-    {
-      "epoch": 6.117936117936118,
-      "grad_norm": 1.1171875,
-      "learning_rate": 7.808332230272209e-05,
-      "loss": 0.393,
-      "step": 1245
-    },
-    {
-      "epoch": 6.142506142506143,
-      "grad_norm": 0.9453125,
-      "learning_rate": 7.724527775841914e-05,
-      "loss": 0.3818,
-      "step": 1250
-    },
-    {
-      "epoch": 6.167076167076167,
-      "grad_norm": 0.921875,
-      "learning_rate": 7.64089152355385e-05,
-      "loss": 0.3938,
-      "step": 1255
-    },
-    {
-      "epoch": 6.191646191646192,
-      "grad_norm": 0.90234375,
-      "learning_rate": 7.55742965577169e-05,
-      "loss": 0.3885,
-      "step": 1260
-    },
-    {
-      "epoch": 6.216216216216216,
-      "grad_norm": 0.94921875,
-      "learning_rate": 7.474148341968652e-05,
-      "loss": 0.3889,
-      "step": 1265
-    },
-    {
-      "epoch": 6.240786240786241,
-      "grad_norm": 1.171875,
-      "learning_rate": 7.391053738271466e-05,
-      "loss": 0.3932,
-      "step": 1270
-    },
-    {
-      "epoch": 6.2653562653562656,
-      "grad_norm": 0.91015625,
-      "learning_rate": 7.308151987005326e-05,
-      "loss": 0.3823,
-      "step": 1275
-    },
-    {
-      "epoch": 6.2899262899262895,
-      "grad_norm": 0.91015625,
-      "learning_rate": 7.225449216239821e-05,
-      "loss": 0.3857,
-      "step": 1280
-    },
-    {
-      "epoch": 6.314496314496314,
-      "grad_norm": 0.8671875,
-      "learning_rate": 7.142951539335981e-05,
-      "loss": 0.3973,
-      "step": 1285
-    },
-    {
-      "epoch": 6.339066339066339,
-      "grad_norm": 0.91796875,
-      "learning_rate": 7.060665054494362e-05,
-      "loss": 0.3901,
-      "step": 1290
-    },
-    {
-      "epoch": 6.363636363636363,
-      "grad_norm": 0.89453125,
-      "learning_rate": 6.978595844304271e-05,
-      "loss": 0.386,
-      "step": 1295
-    },
-    {
-      "epoch": 6.388206388206388,
-      "grad_norm": 1.03125,
-      "learning_rate": 6.89674997529416e-05,
-      "loss": 0.3881,
-      "step": 1300
-    },
-    {
-      "epoch": 6.412776412776413,
-      "grad_norm": 0.89453125,
-      "learning_rate": 6.815133497483157e-05,
-      "loss": 0.4006,
-      "step": 1305
-    },
-    {
-      "epoch": 6.437346437346437,
-      "grad_norm": 1.0234375,
-      "learning_rate": 6.733752443933878e-05,
-      "loss": 0.397,
-      "step": 1310
-    },
-    {
-      "epoch": 6.461916461916462,
-      "grad_norm": 1.0703125,
-      "learning_rate": 6.65261283030646e-05,
-      "loss": 0.3972,
-      "step": 1315
-    },
-    {
-      "epoch": 6.486486486486487,
-      "grad_norm": 1.0390625,
-      "learning_rate": 6.571720654413877e-05,
-      "loss": 0.3965,
-      "step": 1320
-    },
-    {
-      "epoch": 6.511056511056511,
-      "grad_norm": 0.9296875,
-      "learning_rate": 6.491081895778588e-05,
-      "loss": 0.3961,
-      "step": 1325
-    },
-    {
-      "epoch": 6.535626535626536,
-      "grad_norm": 1.0625,
-      "learning_rate": 6.410702515190543e-05,
-      "loss": 0.3919,
-      "step": 1330
-    },
-    {
-      "epoch": 6.560196560196561,
-      "grad_norm": 0.9453125,
-      "learning_rate": 6.330588454266542e-05,
-      "loss": 0.3916,
-      "step": 1335
-    },
-    {
-      "epoch": 6.584766584766585,
-      "grad_norm": 0.9453125,
-      "learning_rate": 6.250745635011048e-05,
-      "loss": 0.4089,
-      "step": 1340
-    },
-    {
-      "epoch": 6.6093366093366095,
-      "grad_norm": 0.9140625,
-      "learning_rate": 6.171179959378437e-05,
-      "loss": 0.401,
-      "step": 1345
-    },
-    {
-      "epoch": 6.6339066339066335,
-      "grad_norm": 0.89453125,
-      "learning_rate": 6.0918973088367116e-05,
-      "loss": 0.3927,
-      "step": 1350
-    },
-    {
-      "epoch": 6.658476658476658,
-      "grad_norm": 1.0,
-      "learning_rate": 6.012903543932766e-05,
-      "loss": 0.3899,
-      "step": 1355
-    },
-    {
-      "epoch": 6.683046683046683,
-      "grad_norm": 0.84765625,
-      "learning_rate": 5.934204503859158e-05,
-      "loss": 0.3952,
-      "step": 1360
-    },
-    {
-      "epoch": 6.707616707616707,
-      "grad_norm": 0.98828125,
-      "learning_rate": 5.8558060060224817e-05,
-      "loss": 0.3917,
-      "step": 1365
-    },
-    {
-      "epoch": 6.732186732186732,
-      "grad_norm": 0.92578125,
-      "learning_rate": 5.777713845613364e-05,
-      "loss": 0.3878,
-      "step": 1370
-    },
-    {
-      "epoch": 6.756756756756757,
-      "grad_norm": 0.95703125,
-      "learning_rate": 5.699933795178052e-05,
-      "loss": 0.4066,
-      "step": 1375
-    },
-    {
-      "epoch": 6.781326781326781,
-      "grad_norm": 0.90625,
-      "learning_rate": 5.622471604191746e-05,
-      "loss": 0.3993,
-      "step": 1380
-    },
-    {
-      "epoch": 6.805896805896806,
-      "grad_norm": 0.98828125,
-      "learning_rate": 5.545332998633572e-05,
-      "loss": 0.3975,
-      "step": 1385
-    },
-    {
-      "epoch": 6.830466830466831,
-      "grad_norm": 0.96875,
-      "learning_rate": 5.46852368056334e-05,
-      "loss": 0.4003,
-      "step": 1390
-    },
-    {
-      "epoch": 6.855036855036855,
-      "grad_norm": 0.828125,
-      "learning_rate": 5.392049327700026e-05,
-      "loss": 0.3978,
-      "step": 1395
-    },
-    {
-      "epoch": 6.87960687960688,
-      "grad_norm": 0.90625,
-      "learning_rate": 5.3159155930021e-05,
-      "loss": 0.4017,
-      "step": 1400
-    },
-    {
-      "epoch": 6.9041769041769046,
-      "grad_norm": 0.90625,
-      "learning_rate": 5.2401281042496494e-05,
-      "loss": 0.3992,
-      "step": 1405
-    },
-    {
-      "epoch": 6.9287469287469285,
-      "grad_norm": 0.95703125,
-      "learning_rate": 5.164692463628378e-05,
-      "loss": 0.3965,
-      "step": 1410
-    },
-    {
-      "epoch": 6.953316953316953,
-      "grad_norm": 0.9140625,
-      "learning_rate": 5.0896142473154987e-05,
-      "loss": 0.3883,
-      "step": 1415
-    },
-    {
-      "epoch": 6.977886977886978,
-      "grad_norm": 0.95703125,
-      "learning_rate": 5.014899005067524e-05,
-      "loss": 0.3899,
-      "step": 1420
-    },
-    {
-      "epoch": 6.997542997542998,
-      "eval_loss": 3.898437023162842,
-      "eval_runtime": 2.0525,
-      "eval_samples_per_second": 4.872,
-      "eval_steps_per_second": 0.974,
-      "step": 1424
-    },
-    {
-      "epoch": 7.002457002457002,
-      "grad_norm": 1.0546875,
-      "learning_rate": 4.940552259810063e-05,
-      "loss": 0.3846,
-      "step": 1425
-    },
-    {
-      "epoch": 7.027027027027027,
-      "grad_norm": 0.82421875,
-      "learning_rate": 4.866579507229545e-05,
-      "loss": 0.3325,
-      "step": 1430
-    },
-    {
-      "epoch": 7.051597051597051,
-      "grad_norm": 1.0,
-      "learning_rate": 4.792986215366976e-05,
-      "loss": 0.3266,
-      "step": 1435
-    },
-    {
-      "epoch": 7.076167076167076,
-      "grad_norm": 1.0078125,
-      "learning_rate": 4.7197778242137755e-05,
-      "loss": 0.3295,
-      "step": 1440
-    },
-    {
-      "epoch": 7.100737100737101,
-      "grad_norm": 0.80078125,
-      "learning_rate": 4.646959745309609e-05,
-      "loss": 0.3279,
-      "step": 1445
-    },
-    {
-      "epoch": 7.125307125307125,
-      "grad_norm": 0.96875,
-      "learning_rate": 4.574537361342407e-05,
-      "loss": 0.324,
-      "step": 1450
-    },
-    {
-      "epoch": 7.14987714987715,
-      "grad_norm": 0.87109375,
-      "learning_rate": 4.502516025750455e-05,
-      "loss": 0.319,
-      "step": 1455
-    },
-    {
-      "epoch": 7.174447174447175,
-      "grad_norm": 0.8125,
-      "learning_rate": 4.430901062326681e-05,
-      "loss": 0.3242,
-      "step": 1460
-    },
-    {
-      "epoch": 7.199017199017199,
-      "grad_norm": 0.8515625,
-      "learning_rate": 4.359697764825123e-05,
-      "loss": 0.3172,
-      "step": 1465
-    },
-    {
-      "epoch": 7.223587223587224,
-      "grad_norm": 0.9140625,
-      "learning_rate": 4.288911396569599e-05,
-      "loss": 0.3275,
-      "step": 1470
-    },
-    {
-      "epoch": 7.2481572481572485,
-      "grad_norm": 0.8828125,
-      "learning_rate": 4.21854719006467e-05,
-      "loss": 0.332,
-      "step": 1475
-    },
-    {
-      "epoch": 7.2727272727272725,
-      "grad_norm": 1.015625,
-      "learning_rate": 4.148610346608837e-05,
-      "loss": 0.3359,
-      "step": 1480
-    },
-    {
-      "epoch": 7.297297297297297,
-      "grad_norm": 0.87109375,
-      "learning_rate": 4.079106035910073e-05,
-      "loss": 0.3242,
-      "step": 1485
-    },
-    {
-      "epoch": 7.321867321867322,
-      "grad_norm": 0.86328125,
-      "learning_rate": 4.010039395703664e-05,
-      "loss": 0.3273,
-      "step": 1490
-    },
-    {
-      "epoch": 7.346437346437346,
-      "grad_norm": 0.9140625,
-      "learning_rate": 3.94141553137245e-05,
-      "loss": 0.3274,
-      "step": 1495
-    },
-    {
-      "epoch": 7.371007371007371,
-      "grad_norm": 0.88671875,
-      "learning_rate": 3.873239515569429e-05,
-      "loss": 0.3266,
-      "step": 1500
-    },
-    {
-      "epoch": 7.395577395577396,
-      "grad_norm": 0.84765625,
-      "learning_rate": 3.80551638784277e-05,
-      "loss": 0.3363,
-      "step": 1505
-    },
-    {
-      "epoch": 7.42014742014742,
-      "grad_norm": 0.87109375,
-      "learning_rate": 3.738251154263333e-05,
-      "loss": 0.335,
-      "step": 1510
-    },
-    {
-      "epoch": 7.444717444717445,
-      "grad_norm": 0.84765625,
-      "learning_rate": 3.671448787054571e-05,
-      "loss": 0.3305,
-      "step": 1515
-    },
-    {
-      "epoch": 7.469287469287469,
-      "grad_norm": 0.8515625,
-      "learning_rate": 3.605114224225028e-05,
-      "loss": 0.332,
-      "step": 1520
-    },
-    {
-      "epoch": 7.493857493857494,
-      "grad_norm": 0.89453125,
-      "learning_rate": 3.5392523692033006e-05,
-      "loss": 0.3261,
-      "step": 1525
-    },
-    {
-      "epoch": 7.518427518427519,
-      "grad_norm": 1.09375,
-      "learning_rate": 3.473868090475574e-05,
-      "loss": 0.3363,
-      "step": 1530
-    },
-    {
-      "epoch": 7.542997542997543,
-      "grad_norm": 0.88671875,
-      "learning_rate": 3.408966221225773e-05,
-      "loss": 0.3239,
-      "step": 1535
-    },
-    {
-      "epoch": 7.5675675675675675,
-      "grad_norm": 0.875,
-      "learning_rate": 3.3445515589782574e-05,
-      "loss": 0.3301,
-      "step": 1540
-    },
-    {
-      "epoch": 7.592137592137592,
-      "grad_norm": 0.875,
-      "learning_rate": 3.2806288652432174e-05,
-      "loss": 0.3228,
-      "step": 1545
-    },
-    {
-      "epoch": 7.616707616707616,
-      "grad_norm": 0.8828125,
-      "learning_rate": 3.217202865164697e-05,
-      "loss": 0.3389,
-      "step": 1550
-    },
-    {
-      "epoch": 7.641277641277641,
-      "grad_norm": 0.859375,
-      "learning_rate": 3.154278247171314e-05,
-      "loss": 0.3302,
-      "step": 1555
-    },
-    {
-      "epoch": 7.665847665847666,
-      "grad_norm": 0.89453125,
-      "learning_rate": 3.09185966262968e-05,
-      "loss": 0.3245,
-      "step": 1560
-    },
-    {
-      "epoch": 7.69041769041769,
-      "grad_norm": 0.86328125,
-      "learning_rate": 3.0299517255005937e-05,
-      "loss": 0.3306,
-      "step": 1565
-    },
-    {
-      "epoch": 7.714987714987715,
-      "grad_norm": 0.90625,
-      "learning_rate": 2.9685590119979688e-05,
-      "loss": 0.3322,
-      "step": 1570
-    },
-    {
-      "epoch": 7.739557739557739,
-      "grad_norm": 0.8828125,
-      "learning_rate": 2.9076860602505564e-05,
-      "loss": 0.3327,
-      "step": 1575
-    },
-    {
-      "epoch": 7.764127764127764,
-      "grad_norm": 0.953125,
-      "learning_rate": 2.8473373699664997e-05,
-      "loss": 0.3342,
-      "step": 1580
-    },
-    {
-      "epoch": 7.788697788697789,
-      "grad_norm": 0.89453125,
-      "learning_rate": 2.7875174021007e-05,
-      "loss": 0.3318,
-      "step": 1585
-    },
-    {
-      "epoch": 7.813267813267814,
-      "grad_norm": 0.85546875,
-      "learning_rate": 2.728230578525086e-05,
-      "loss": 0.3357,
-      "step": 1590
-    },
-    {
-      "epoch": 7.837837837837838,
-      "grad_norm": 0.8984375,
-      "learning_rate": 2.669481281701739e-05,
-      "loss": 0.3345,
-      "step": 1595
-    },
-    {
-      "epoch": 7.862407862407863,
-      "grad_norm": 0.94921875,
-      "learning_rate": 2.6112738543589312e-05,
-      "loss": 0.3324,
-      "step": 1600
-    },
-    {
-      "epoch": 7.886977886977887,
-      "grad_norm": 0.86328125,
-      "learning_rate": 2.553612599170143e-05,
-      "loss": 0.3278,
-      "step": 1605
-    },
-    {
-      "epoch": 7.9115479115479115,
-      "grad_norm": 0.8046875,
-      "learning_rate": 2.496501778435977e-05,
-      "loss": 0.323,
-      "step": 1610
-    },
-    {
-      "epoch": 7.936117936117936,
-      "grad_norm": 0.953125,
-      "learning_rate": 2.4399456137691147e-05,
-      "loss": 0.3364,
-      "step": 1615
-    },
-    {
-      "epoch": 7.96068796068796,
-      "grad_norm": 0.85546875,
-      "learning_rate": 2.3839482857822458e-05,
-      "loss": 0.3348,
-      "step": 1620
-    },
-    {
-      "epoch": 7.985257985257985,
-      "grad_norm": 0.92578125,
-      "learning_rate": 2.328513933779034e-05,
-      "loss": 0.3321,
-      "step": 1625
-    },
-    {
-      "epoch": 8.0,
-      "eval_loss": 4.43484354019165,
-      "eval_runtime": 2.0469,
-      "eval_samples_per_second": 4.885,
-      "eval_steps_per_second": 0.977,
-      "step": 1628
-    },
-    {
-      "epoch": 8.00982800982801,
-      "grad_norm": 0.703125,
-      "learning_rate": 2.2736466554481617e-05,
-      "loss": 0.3114,
-      "step": 1630
-    },
-    {
-      "epoch": 8.034398034398034,
-      "grad_norm": 0.78125,
-      "learning_rate": 2.2193505065604014e-05,
-      "loss": 0.3037,
-      "step": 1635
-    },
-    {
-      "epoch": 8.058968058968059,
-      "grad_norm": 0.78515625,
-      "learning_rate": 2.1656295006688353e-05,
-      "loss": 0.2959,
-      "step": 1640
-    },
-    {
-      "epoch": 8.083538083538084,
-      "grad_norm": 0.88671875,
-      "learning_rate": 2.1124876088121692e-05,
-      "loss": 0.3102,
-      "step": 1645
-    },
-    {
-      "epoch": 8.108108108108109,
-      "grad_norm": 0.83984375,
-      "learning_rate": 2.0599287592211968e-05,
-      "loss": 0.2975,
-      "step": 1650
-    },
-    {
-      "epoch": 8.132678132678132,
-      "grad_norm": 0.82421875,
-      "learning_rate": 2.0079568370284128e-05,
-      "loss": 0.2934,
-      "step": 1655
-    },
-    {
-      "epoch": 8.157248157248157,
-      "grad_norm": 0.84765625,
-      "learning_rate": 1.956575683980846e-05,
-      "loss": 0.299,
-      "step": 1660
-    },
-    {
-      "epoch": 8.181818181818182,
-      "grad_norm": 0.81640625,
-      "learning_rate": 1.9057890981560677e-05,
-      "loss": 0.2952,
-      "step": 1665
-    },
-    {
-      "epoch": 8.206388206388207,
-      "grad_norm": 0.8046875,
-      "learning_rate": 1.85560083368143e-05,
-      "loss": 0.3067,
-      "step": 1670
-    },
-    {
-      "epoch": 8.230958230958231,
-      "grad_norm": 0.8203125,
-      "learning_rate": 1.806014600456588e-05,
-      "loss": 0.2978,
-      "step": 1675
-    },
-    {
-      "epoch": 8.255528255528256,
-      "grad_norm": 0.80078125,
-      "learning_rate": 1.757034063879235e-05,
-      "loss": 0.2973,
-      "step": 1680
-    },
-    {
-      "epoch": 8.28009828009828,
-      "grad_norm": 0.796875,
-      "learning_rate": 1.708662844574178e-05,
-      "loss": 0.2983,
-      "step": 1685
-    },
-    {
-      "epoch": 8.304668304668304,
-      "grad_norm": 0.78125,
-      "learning_rate": 1.6609045181256976e-05,
-      "loss": 0.2991,
-      "step": 1690
-    },
-    {
-      "epoch": 8.32923832923833,
-      "grad_norm": 0.8203125,
-      "learning_rate": 1.61376261481323e-05,
-      "loss": 0.3102,
-      "step": 1695
-    },
-    {
-      "epoch": 8.353808353808354,
-      "grad_norm": 0.8828125,
-      "learning_rate": 1.5672406193504384e-05,
-      "loss": 0.2984,
-      "step": 1700
-    },
-    {
-      "epoch": 8.378378378378379,
-      "grad_norm": 0.7578125,
-      "learning_rate": 1.5213419706275878e-05,
-      "loss": 0.298,
-      "step": 1705
-    },
-    {
-      "epoch": 8.402948402948404,
-      "grad_norm": 0.8046875,
-      "learning_rate": 1.4760700614573731e-05,
-      "loss": 0.3021,
-      "step": 1710
-    },
-    {
-      "epoch": 8.427518427518427,
-      "grad_norm": 0.84375,
-      "learning_rate": 1.4314282383241096e-05,
-      "loss": 0.3068,
-      "step": 1715
-    },
-    {
-      "epoch": 8.452088452088452,
-      "grad_norm": 0.82421875,
-      "learning_rate": 1.3874198011363582e-05,
-      "loss": 0.3038,
-      "step": 1720
-    },
-    {
-      "epoch": 8.476658476658477,
-      "grad_norm": 0.8359375,
-      "learning_rate": 1.3440480029830127e-05,
-      "loss": 0.3024,
-      "step": 1725
-    },
-    {
-      "epoch": 8.501228501228502,
-      "grad_norm": 0.86328125,
-      "learning_rate": 1.301316049892818e-05,
-      "loss": 0.3018,
-      "step": 1730
-    },
-    {
-      "epoch": 8.525798525798526,
-      "grad_norm": 0.87890625,
-      "learning_rate": 1.2592271005973888e-05,
-      "loss": 0.3034,
-      "step": 1735
-    },
-    {
-      "epoch": 8.55036855036855,
-      "grad_norm": 0.8046875,
-      "learning_rate": 1.2177842662977135e-05,
-      "loss": 0.2999,
-      "step": 1740
-    },
-    {
-      "epoch": 8.574938574938574,
-      "grad_norm": 0.91796875,
-      "learning_rate": 1.1769906104341832e-05,
-      "loss": 0.2997,
-      "step": 1745
-    },
-    {
-      "epoch": 8.5995085995086,
-      "grad_norm": 0.8203125,
-      "learning_rate": 1.136849148460125e-05,
-      "loss": 0.3042,
-      "step": 1750
-    },
-    {
-      "epoch": 8.624078624078624,
-      "grad_norm": 0.88671875,
-      "learning_rate": 1.0973628476189257e-05,
-      "loss": 0.2933,
-      "step": 1755
-    },
-    {
-      "epoch": 8.64864864864865,
-      "grad_norm": 0.8046875,
-      "learning_rate": 1.0585346267246743e-05,
-      "loss": 0.2983,
-      "step": 1760
-    },
-    {
-      "epoch": 8.673218673218674,
-      "grad_norm": 0.7890625,
-      "learning_rate": 1.0203673559464089e-05,
-      "loss": 0.2963,
-      "step": 1765
-    },
-    {
-      "epoch": 8.697788697788697,
-      "grad_norm": 0.83984375,
-      "learning_rate": 9.82863856595968e-06,
-      "loss": 0.2952,
-      "step": 1770
-    },
-    {
-      "epoch": 8.722358722358722,
-      "grad_norm": 0.8359375,
-      "learning_rate": 9.460269009194167e-06,
-      "loss": 0.304,
-      "step": 1775
-    },
-    {
-      "epoch": 8.746928746928747,
-      "grad_norm": 0.84765625,
-      "learning_rate": 9.098592118921435e-06,
-      "loss": 0.3037,
-      "step": 1780
-    },
-    {
-      "epoch": 8.771498771498772,
-      "grad_norm": 0.79296875,
-      "learning_rate": 8.74363463017569e-06,
-      "loss": 0.3058,
-      "step": 1785
-    },
-    {
-      "epoch": 8.796068796068797,
-      "grad_norm": 1.03125,
-      "learning_rate": 8.395422781295192e-06,
-      "loss": 0.3031,
-      "step": 1790
-    },
-    {
-      "epoch": 8.82063882063882,
-      "grad_norm": 0.875,
-      "learning_rate": 8.053982311982867e-06,
-      "loss": 0.3017,
-      "step": 1795
-    },
-    {
-      "epoch": 8.845208845208845,
-      "grad_norm": 0.9453125,
-      "learning_rate": 7.719338461403435e-06,
-      "loss": 0.2978,
-      "step": 1800
-    },
-    {
-      "epoch": 8.86977886977887,
-      "grad_norm": 0.82421875,
-      "learning_rate": 7.3915159663179075e-06,
-      "loss": 0.2905,
-      "step": 1805
-    },
-    {
-      "epoch": 8.894348894348894,
-      "grad_norm": 0.81640625,
-      "learning_rate": 7.070539059254977e-06,
-      "loss": 0.2957,
-      "step": 1810
-    },
-    {
-      "epoch": 8.91891891891892,
-      "grad_norm": 0.7890625,
-      "learning_rate": 6.756431466719737e-06,
-      "loss": 0.2988,
-      "step": 1815
-    },
-    {
-      "epoch": 8.943488943488944,
-      "grad_norm": 0.84375,
-      "learning_rate": 6.4492164074399065e-06,
-      "loss": 0.3019,
-      "step": 1820
-    },
-    {
-      "epoch": 8.968058968058967,
-      "grad_norm": 0.8359375,
-      "learning_rate": 6.148916590649434e-06,
-      "loss": 0.3051,
-      "step": 1825
-    },
-    {
-      "epoch": 8.992628992628992,
-      "grad_norm": 0.859375,
-      "learning_rate": 5.8555542144098865e-06,
-      "loss": 0.3089,
-      "step": 1830
-    },
-    {
-      "epoch": 8.997542997542997,
-      "eval_loss": 4.689078330993652,
-      "eval_runtime": 2.054,
-      "eval_samples_per_second": 4.868,
-      "eval_steps_per_second": 0.974,
-      "step": 1831
-    },
-    {
-      "epoch": 9.017199017199017,
-      "grad_norm": 0.78515625,
-      "learning_rate": 5.569150963969494e-06,
-      "loss": 0.2973,
-      "step": 1835
-    },
-    {
-      "epoch": 9.041769041769042,
-      "grad_norm": 0.80078125,
-      "learning_rate": 5.289728010160366e-06,
-      "loss": 0.3,
-      "step": 1840
-    },
-    {
-      "epoch": 9.066339066339067,
-      "grad_norm": 0.7578125,
-      "learning_rate": 5.0173060078333225e-06,
-      "loss": 0.299,
-      "step": 1845
-    },
-    {
-      "epoch": 9.090909090909092,
-      "grad_norm": 0.796875,
-      "learning_rate": 4.7519050943312325e-06,
-      "loss": 0.2971,
-      "step": 1850
-    },
-    {
-      "epoch": 9.115479115479115,
-      "grad_norm": 0.78125,
-      "learning_rate": 4.493544888000467e-06,
-      "loss": 0.2968,
-      "step": 1855
-    },
-    {
-      "epoch": 9.14004914004914,
-      "grad_norm": 0.765625,
-      "learning_rate": 4.242244486740643e-06,
-      "loss": 0.295,
-      "step": 1860
-    },
-    {
-      "epoch": 9.164619164619165,
-      "grad_norm": 0.8125,
-      "learning_rate": 3.99802246659301e-06,
-      "loss": 0.2911,
-      "step": 1865
-    },
-    {
-      "epoch": 9.18918918918919,
-      "grad_norm": 0.76953125,
-      "learning_rate": 3.760896880367215e-06,
-      "loss": 0.2916,
-      "step": 1870
-    },
-    {
-      "epoch": 9.213759213759214,
-      "grad_norm": 0.7890625,
-      "learning_rate": 3.530885256306915e-06,
-      "loss": 0.2998,
-      "step": 1875
-    },
-    {
-      "epoch": 9.238329238329237,
-      "grad_norm": 0.8046875,
-      "learning_rate": 3.308004596794101e-06,
-      "loss": 0.2948,
-      "step": 1880
-    },
-    {
-      "epoch": 9.262899262899262,
-      "grad_norm": 0.81640625,
-      "learning_rate": 3.092271377092215e-06,
-      "loss": 0.2917,
-      "step": 1885
-    },
-    {
-      "epoch": 9.287469287469287,
-      "grad_norm": 0.75,
-      "learning_rate": 2.8837015441283586e-06,
-      "loss": 0.295,
-      "step": 1890
-    },
-    {
-      "epoch": 9.312039312039312,
-      "grad_norm": 0.74609375,
-      "learning_rate": 2.682310515314512e-06,
-      "loss": 0.2928,
-      "step": 1895
-    },
-    {
-      "epoch": 9.336609336609337,
-      "grad_norm": 0.8046875,
-      "learning_rate": 2.488113177407869e-06,
-      "loss": 0.2909,
-      "step": 1900
-    },
-    {
-      "epoch": 9.361179361179362,
-      "grad_norm": 0.75,
-      "learning_rate": 2.3011238854103947e-06,
-      "loss": 0.2929,
-      "step": 1905
-    },
-    {
-      "epoch": 9.385749385749385,
-      "grad_norm": 0.796875,
-      "learning_rate": 2.1213564615077065e-06,
-      "loss": 0.2936,
-      "step": 1910
-    },
-    {
-      "epoch": 9.41031941031941,
-      "grad_norm": 0.79296875,
-      "learning_rate": 1.9488241940473828e-06,
-      "loss": 0.2941,
-      "step": 1915
-    },
-    {
-      "epoch": 9.434889434889435,
-      "grad_norm": 0.76171875,
-      "learning_rate": 1.783539836556669e-06,
-      "loss": 0.2941,
-      "step": 1920
-    },
-    {
-      "epoch": 9.45945945945946,
-      "grad_norm": 0.78515625,
-      "learning_rate": 1.6255156067997323e-06,
-      "loss": 0.2869,
-      "step": 1925
-    },
-    {
-      "epoch": 9.484029484029485,
-      "grad_norm": 0.80859375,
-      "learning_rate": 1.474763185874517e-06,
-      "loss": 0.3029,
-      "step": 1930
-    },
-    {
-      "epoch": 9.50859950859951,
-      "grad_norm": 0.80859375,
-      "learning_rate": 1.3312937173493577e-06,
-      "loss": 0.2943,
-      "step": 1935
-    },
-    {
-      "epoch": 9.533169533169533,
-      "grad_norm": 0.83984375,
-      "learning_rate": 1.19511780643915e-06,
-      "loss": 0.2963,
-      "step": 1940
-    },
-    {
-      "epoch": 9.557739557739557,
-      "grad_norm": 0.765625,
-      "learning_rate": 1.066245519221465e-06,
-      "loss": 0.3002,
-      "step": 1945
-    },
-    {
-      "epoch": 9.582309582309582,
-      "grad_norm": 0.76171875,
-      "learning_rate": 9.446863818924679e-07,
-      "loss": 0.2988,
-      "step": 1950
-    },
-    {
-      "epoch": 9.606879606879607,
-      "grad_norm": 0.77734375,
-      "learning_rate": 8.304493800627589e-07,
-      "loss": 0.297,
-      "step": 1955
-    },
-    {
-      "epoch": 9.631449631449632,
-      "grad_norm": 0.75,
-      "learning_rate": 7.235429580931152e-07,
-      "loss": 0.2902,
-      "step": 1960
-    },
-    {
-      "epoch": 9.656019656019655,
-      "grad_norm": 0.859375,
-      "learning_rate": 6.239750184703464e-07,
-      "loss": 0.2933,
-      "step": 1965
-    },
-    {
-      "epoch": 9.68058968058968,
-      "grad_norm": 0.76953125,
-      "learning_rate": 5.317529212230721e-07,
-      "loss": 0.2943,
-      "step": 1970
-    },
-    {
-      "epoch": 9.705159705159705,
-      "grad_norm": 0.78125,
-      "learning_rate": 4.4688348337774686e-07,
-      "loss": 0.3049,
-      "step": 1975
-    },
-    {
-      "epoch": 9.72972972972973,
-      "grad_norm": 0.83984375,
-      "learning_rate": 3.693729784546962e-07,
-      "loss": 0.2876,
-      "step": 1980
-    },
-    {
-      "epoch": 9.754299754299755,
-      "grad_norm": 0.78515625,
-      "learning_rate": 2.9922713600439854e-07,
-      "loss": 0.2971,
-      "step": 1985
-    },
-    {
-      "epoch": 9.77886977886978,
-      "grad_norm": 0.765625,
-      "learning_rate": 2.3645114118395762e-07,
-      "loss": 0.288,
-      "step": 1990
-    },
-    {
-      "epoch": 9.803439803439803,
-      "grad_norm": 0.84765625,
-      "learning_rate": 1.8104963437381993e-07,
-      "loss": 0.2951,
-      "step": 1995
-    },
-    {
-      "epoch": 9.828009828009828,
-      "grad_norm": 0.7578125,
-      "learning_rate": 1.3302671083474938e-07,
-      "loss": 0.2927,
-      "step": 2000
-    },
-    {
-      "epoch": 9.852579852579852,
-      "grad_norm": 0.78515625,
-      "learning_rate": 9.238592040512472e-08,
-      "loss": 0.2932,
-      "step": 2005
-    },
-    {
-      "epoch": 9.877149877149877,
-      "grad_norm": 0.75390625,
-      "learning_rate": 5.913026723850523e-08,
-      "loss": 0.2913,
-      "step": 2010
-    },
-    {
-      "epoch": 9.901719901719902,
-      "grad_norm": 0.8046875,
-      "learning_rate": 3.3262209581619297e-08,
-      "loss": 0.2997,
-      "step": 2015
-    },
-    {
-      "epoch": 9.926289926289925,
-      "grad_norm": 0.85546875,
-      "learning_rate": 1.4783659592576548e-08,
-      "loss": 0.2977,
-      "step": 2020
-    },
-    {
-      "epoch": 9.95085995085995,
-      "grad_norm": 0.80078125,
-      "learning_rate": 3.6959831996030704e-09,
-      "loss": 0.2988,
-      "step": 2025
-    },
-    {
-      "epoch": 9.975429975429975,
-      "grad_norm": 0.84375,
-      "learning_rate": 0.0,
-      "loss": 0.3016,
-      "step": 2030
-    },
-    {
-      "epoch": 9.975429975429975,
-      "eval_loss": 4.717045307159424,
-      "eval_runtime": 2.0451,
-      "eval_samples_per_second": 4.89,
-      "eval_steps_per_second": 0.978,
-      "step": 2030
     },
     {
-      "epoch": 9.975429975429975,
-      "step": 2030,
-      "total_flos": 1.5518062706111283e+18,
-      "train_loss": 1.3545548074938394,
-      "train_runtime": 13205.4864,
-      "train_samples_per_second": 2.465,
-      "train_steps_per_second": 0.154
     }
   ],
   "logging_steps": 5,
-  "max_steps": 2030,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 100,
-  "total_flos": 1.5518062706111283e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 10.0,
   "eval_steps": 500,
+  "global_step": 550,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.01818181818181818,
+      "grad_norm": 251.0,
+      "learning_rate": 3.636363636363636e-06,
+      "loss": 46.8778,
       "step": 1
     },
     {
+      "epoch": 0.09090909090909091,
+      "grad_norm": 239.0,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 46.8224,
       "step": 5
     },
     {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 106.5,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 39.5877,
       "step": 10
     },
     {
+      "epoch": 0.2727272727272727,
+      "grad_norm": 19.75,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 28.2487,
       "step": 15
     },
     {
+      "epoch": 0.36363636363636365,
+      "grad_norm": 15.8125,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 23.0407,
       "step": 20
     },
     {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 5.5,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 20.515,
       "step": 25
     },
     {
+      "epoch": 0.5454545454545454,
+      "grad_norm": 4.1875,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 19.5202,
       "step": 30
     },
     {
+      "epoch": 0.6363636363636364,
+      "grad_norm": 5.75,
+      "learning_rate": 0.00012727272727272728,
+      "loss": 18.2599,
       "step": 35
     },
     {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 10.8125,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 16.923,
       "step": 40
     },
     {
+      "epoch": 0.8181818181818182,
+      "grad_norm": 21.25,
+      "learning_rate": 0.00016363636363636366,
+      "loss": 14.3261,
       "step": 45
     },
     {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 29.625,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 9.0683,
       "step": 50
     },
     {
+      "epoch": 1.0,
+      "grad_norm": 9.625,
+      "learning_rate": 0.0002,
+      "loss": 3.5798,
+      "step": 55
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 4.059033393859863,
+      "eval_runtime": 0.2418,
+      "eval_samples_per_second": 41.356,
+      "eval_steps_per_second": 4.136,
       "step": 55
     },
     {
+      "epoch": 1.0909090909090908,
+      "grad_norm": 10.125,
+      "learning_rate": 0.00019994965423831854,
+      "loss": 2.5621,
       "step": 60
     },
     {
+      "epoch": 1.1818181818181819,
+      "grad_norm": 7.4375,
+      "learning_rate": 0.00019979866764718843,
+      "loss": 2.0701,
       "step": 65
     },
     {
+      "epoch": 1.2727272727272727,
+      "grad_norm": 20.125,
+      "learning_rate": 0.00019954719225730847,
+      "loss": 1.8568,
       "step": 70
     },
     {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00019919548128307954,
+      "loss": 1.6836,
       "step": 75
     },
     {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 1.078125,
+      "learning_rate": 0.00019874388886763944,
+      "loss": 1.5757,
       "step": 80
     },
     {
+      "epoch": 1.5454545454545454,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00019819286972627066,
+      "loss": 1.52,
       "step": 85
     },
     {
+      "epoch": 1.6363636363636362,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00019754297868854073,
+      "loss": 1.4757,
       "step": 90
     },
     {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 3.46875,
+      "learning_rate": 0.00019679487013963564,
+      "loss": 1.4221,
       "step": 95
     },
     {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.00019594929736144976,
+      "loss": 1.4012,
       "step": 100
     },
     {
+      "epoch": 1.9090909090909092,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.00019500711177409454,
+      "loss": 1.3672,
       "step": 105
     },
     {
+      "epoch": 2.0,
+      "grad_norm": 0.6953125,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 1.346,
+      "step": 110
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 2.6943764686584473,
+      "eval_runtime": 0.2333,
+      "eval_samples_per_second": 42.871,
+      "eval_steps_per_second": 4.287,
       "step": 110
     },
     {
+      "epoch": 2.090909090909091,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00019283679330160726,
+      "loss": 1.3092,
       "step": 115
     },
     {
+      "epoch": 2.1818181818181817,
+      "grad_norm": 1.8828125,
+      "learning_rate": 0.00019161084574320696,
+      "loss": 1.3052,
       "step": 120
     },
     {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.625,
+      "learning_rate": 0.00019029265382866214,
+      "loss": 1.2724,
       "step": 125
     },
     {
+      "epoch": 2.3636363636363638,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00018888354486549237,
+      "loss": 1.2612,
       "step": 130
     },
     {
+      "epoch": 2.4545454545454546,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00018738493770697852,
+      "loss": 1.2532,
       "step": 135
     },
     {
+      "epoch": 2.5454545454545454,
+      "grad_norm": 4.25,
+      "learning_rate": 0.00018579834132349772,
+      "loss": 1.2422,
       "step": 140
     },
     {
+      "epoch": 2.6363636363636362,
+      "grad_norm": 0.62890625,
+      "learning_rate": 0.00018412535328311814,
+      "loss": 1.2256,
       "step": 145
     },
     {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 2.421875,
+      "learning_rate": 0.0001823676581429833,
+      "loss": 1.2271,
       "step": 150
     },
     {
+      "epoch": 2.8181818181818183,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.00018052702575310588,
+      "loss": 1.2114,
       "step": 155
     },
     {
+      "epoch": 2.909090909090909,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.00017860530947427875,
+      "loss": 1.2068,
       "step": 160
     },
     {
+      "epoch": 3.0,
+      "grad_norm": 10.6875,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 1.1944,
       "step": 165
     },
     {
+      "epoch": 3.0,
+      "eval_loss": 2.6026840209960938,
+      "eval_runtime": 0.2422,
+      "eval_samples_per_second": 41.281,
+      "eval_steps_per_second": 4.128,
+      "step": 165
+    },
+    {
+      "epoch": 3.090909090909091,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.0001745264449675755,
+      "loss": 1.194,
       "step": 170
     },
     {
+      "epoch": 3.1818181818181817,
+      "grad_norm": 3.515625,
+      "learning_rate": 0.00017237340381050703,
+      "loss": 1.1668,
       "step": 175
     },
     {
+      "epoch": 3.2727272727272725,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00017014748877063214,
+      "loss": 1.1693,
       "step": 180
     },
     {
+      "epoch": 3.3636363636363638,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.00016785094115571322,
+      "loss": 1.167,
       "step": 185
     },
     {
+      "epoch": 3.4545454545454546,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 1.1601,
       "step": 190
     },
     {
+      "epoch": 3.5454545454545454,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00016305526670845226,
+      "loss": 1.1343,
       "step": 195
     },
     {
+      "epoch": 3.6363636363636362,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00016056096871376667,
+      "loss": 1.1379,
       "step": 200
     },
     {
+      "epoch": 3.7272727272727275,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.00015800569095711982,
+      "loss": 1.1339,
       "step": 205
     },
     {
+      "epoch": 3.8181818181818183,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00015539200638661104,
+      "loss": 1.1393,
       "step": 210
     },
     {
+      "epoch": 3.909090909090909,
+      "grad_norm": 0.7421875,
+      "learning_rate": 0.00015272254676105025,
+      "loss": 1.1015,
       "step": 215
     },
     {
+      "epoch": 4.0,
+      "grad_norm": 0.84765625,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1119,
       "step": 220
     },
     {
+      "epoch": 4.0,
+      "eval_loss": 2.5778679847717285,
+      "eval_runtime": 0.2336,
+      "eval_samples_per_second": 42.816,
+      "eval_steps_per_second": 4.282,
+      "step": 220
+    },
+    {
+      "epoch": 4.090909090909091,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.0001472271074772683,
+      "loss": 1.113,
       "step": 225
     },
     {
+      "epoch": 4.181818181818182,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00014440666126057744,
+      "loss": 1.0963,
       "step": 230
     },
     {
+      "epoch": 4.2727272727272725,
+      "grad_norm": 1.3359375,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 1.104,
       "step": 235
     },
     {
+      "epoch": 4.363636363636363,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00013863451256931287,
+      "loss": 1.0957,
       "step": 240
     },
     {
+      "epoch": 4.454545454545454,
+      "grad_norm": 1.9375,
+      "learning_rate": 0.00013568862215918717,
+      "loss": 1.0835,
       "step": 245
     },
     {
+      "epoch": 4.545454545454545,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00013270679633174218,
+      "loss": 1.0802,
       "step": 250
     },
     {
+      "epoch": 4.636363636363637,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.0001296920375328275,
+      "loss": 1.0762,
       "step": 255
     },
     {
+      "epoch": 4.7272727272727275,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.00012664738136900348,
+      "loss": 1.0773,
       "step": 260
     },
     {
+      "epoch": 4.818181818181818,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00012357589355094275,
+      "loss": 1.082,
       "step": 265
     },
     {
+      "epoch": 4.909090909090909,
+      "grad_norm": 1.7578125,
+      "learning_rate": 0.00012048066680651908,
+      "loss": 1.0687,
       "step": 270
     },
     {
+      "epoch": 5.0,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 1.0741,
       "step": 275
     },
     {
+      "epoch": 5.0,
+      "eval_loss": 2.5495071411132812,
+      "eval_runtime": 0.2341,
+      "eval_samples_per_second": 42.713,
+      "eval_steps_per_second": 4.271,
+      "step": 275
+    },
+    {
+      "epoch": 5.090909090909091,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 1.0546,
       "step": 280
     },
     {
+      "epoch": 5.181818181818182,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.00011108381999010111,
+      "loss": 1.0566,
       "step": 285
     },
     {
+      "epoch": 5.2727272727272725,
+      "grad_norm": 0.8984375,
+      "learning_rate": 0.00010792499568567884,
+      "loss": 1.0477,
       "step": 290
     },
     {
+      "epoch": 5.363636363636363,
+      "grad_norm": 0.890625,
+      "learning_rate": 0.00010475819158237425,
+      "loss": 1.0483,
       "step": 295
     },
     {
+      "epoch": 5.454545454545454,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.00010158659638348081,
+      "loss": 1.0542,
       "step": 300
     },
     {
+      "epoch": 5.545454545454545,
+      "grad_norm": 0.57421875,
+      "learning_rate": 9.84134036165192e-05,
+      "loss": 1.0437,
       "step": 305
     },
     {
+      "epoch": 5.636363636363637,
+      "grad_norm": 0.76953125,
+      "learning_rate": 9.524180841762577e-05,
+      "loss": 1.0542,
       "step": 310
     },
     {
+      "epoch": 5.7272727272727275,
+      "grad_norm": 0.546875,
+      "learning_rate": 9.207500431432115e-05,
+      "loss": 1.0424,
       "step": 315
     },
     {
+      "epoch": 5.818181818181818,
+      "grad_norm": 0.494140625,
+      "learning_rate": 8.891618000989891e-05,
+      "loss": 1.0356,
       "step": 320
     },
     {
+      "epoch": 5.909090909090909,
+      "grad_norm": 0.58203125,
+      "learning_rate": 8.57685161726715e-05,
+      "loss": 1.0432,
       "step": 325
     },
     {
+      "epoch": 6.0,
+      "grad_norm": 0.55078125,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 1.0435,
+      "step": 330
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.5500569343566895,
+      "eval_runtime": 0.2352,
+      "eval_samples_per_second": 42.524,
+      "eval_steps_per_second": 4.252,
       "step": 330
     },
     {
+      "epoch": 6.090909090909091,
+      "grad_norm": 0.59375,
+      "learning_rate": 7.951933319348095e-05,
+      "loss": 1.0185,
       "step": 335
     },
     {
+      "epoch": 6.181818181818182,
+      "grad_norm": 0.58203125,
+      "learning_rate": 7.642410644905726e-05,
+      "loss": 1.0249,
       "step": 340
     },
     {
+      "epoch": 6.2727272727272725,
+      "grad_norm": 0.6171875,
+      "learning_rate": 7.335261863099651e-05,
+      "loss": 1.0206,
       "step": 345
     },
     {
+      "epoch": 6.363636363636363,
+      "grad_norm": 0.6015625,
+      "learning_rate": 7.030796246717255e-05,
+      "loss": 1.0209,
       "step": 350
     },
     {
+      "epoch": 6.454545454545454,
+      "grad_norm": 0.5859375,
+      "learning_rate": 6.729320366825784e-05,
+      "loss": 1.0285,
       "step": 355
     },
     {
+      "epoch": 6.545454545454545,
+      "grad_norm": 0.75,
+      "learning_rate": 6.431137784081282e-05,
+      "loss": 1.0206,
       "step": 360
     },
     {
+      "epoch": 6.636363636363637,
+      "grad_norm": 0.5859375,
+      "learning_rate": 6.136548743068713e-05,
+      "loss": 1.0245,
       "step": 365
     },
     {
+      "epoch": 6.7272727272727275,
+      "grad_norm": 0.5078125,
+      "learning_rate": 5.845849869981137e-05,
+      "loss": 1.0174,
       "step": 370
     },
     {
+      "epoch": 6.818181818181818,
+      "grad_norm": 0.75,
+      "learning_rate": 5.559333873942259e-05,
+      "loss": 1.0206,
       "step": 375
     },
     {
+      "epoch": 6.909090909090909,
+      "grad_norm": 0.828125,
+      "learning_rate": 5.277289252273174e-05,
+      "loss": 1.0244,
       "step": 380
     },
     {
+      "epoch": 7.0,
+      "grad_norm": 0.83203125,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 1.0191,
+      "step": 385
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 2.5536184310913086,
+      "eval_runtime": 0.2341,
+      "eval_samples_per_second": 42.722,
+      "eval_steps_per_second": 4.272,
       "step": 385
     },
     {
+      "epoch": 7.090909090909091,
+      "grad_norm": 0.7734375,
+      "learning_rate": 4.727745323894976e-05,
+      "loss": 1.011,
       "step": 390
     },
     {
+      "epoch": 7.181818181818182,
+      "grad_norm": 0.61328125,
+      "learning_rate": 4.4607993613388976e-05,
+      "loss": 1.0087,
       "step": 395
     },
     {
+      "epoch": 7.2727272727272725,
+      "grad_norm": 0.6875,
+      "learning_rate": 4.19943090428802e-05,
+      "loss": 1.0019,
       "step": 400
     },
     {
+      "epoch": 7.363636363636363,
+      "grad_norm": 0.7890625,
+      "learning_rate": 3.943903128623335e-05,
+      "loss": 1.0112,
       "step": 405
     },
     {
+      "epoch": 7.454545454545454,
+      "grad_norm": 0.58203125,
+      "learning_rate": 3.694473329154778e-05,
+      "loss": 1.0075,
       "step": 410
     },
     {
+      "epoch": 7.545454545454545,
+      "grad_norm": 0.5703125,
+      "learning_rate": 3.45139266054715e-05,
+      "loss": 1.0023,
       "step": 415
     },
     {
+      "epoch": 7.636363636363637,
+      "grad_norm": 0.8046875,
+      "learning_rate": 3.21490588442868e-05,
+      "loss": 1.0112,
       "step": 420
     },
     {
+      "epoch": 7.7272727272727275,
+      "grad_norm": 0.4765625,
+      "learning_rate": 2.9852511229367865e-05,
+      "loss": 1.0069,
       "step": 425
     },
     {
+      "epoch": 7.818181818181818,
+      "grad_norm": 0.45703125,
+      "learning_rate": 2.7626596189492983e-05,
+      "loss": 1.002,
       "step": 430
     },
     {
+      "epoch": 7.909090909090909,
+      "grad_norm": 0.55078125,
+      "learning_rate": 2.5473555032424533e-05,
+      "loss": 1.0076,
       "step": 435
     },
     {
+      "epoch": 8.0,
+      "grad_norm": 0.478515625,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.9965,
+      "step": 440
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 2.5604465007781982,
+      "eval_runtime": 0.24,
+      "eval_samples_per_second": 41.665,
+      "eval_steps_per_second": 4.167,
       "step": 440
     },
     {
+      "epoch": 8.090909090909092,
+      "grad_norm": 0.61328125,
+      "learning_rate": 2.139469052572127e-05,
+      "loss": 1.0043,
       "step": 445
     },
     {
+      "epoch": 8.181818181818182,
+      "grad_norm": 0.58984375,
+      "learning_rate": 1.947297424689414e-05,
+      "loss": 0.9997,
       "step": 450
     },
     {
+      "epoch": 8.272727272727273,
+      "grad_norm": 0.57421875,
+      "learning_rate": 1.763234185701673e-05,
+      "loss": 0.9942,
       "step": 455
     },
     {
+      "epoch": 8.363636363636363,
+      "grad_norm": 0.5625,
+      "learning_rate": 1.587464671688187e-05,
+      "loss": 0.9952,
       "step": 460
     },
     {
+      "epoch": 8.454545454545455,
+      "grad_norm": 0.453125,
+      "learning_rate": 1.4201658676502294e-05,
+      "loss": 1.0037,
       "step": 465
     },
     {
+      "epoch": 8.545454545454545,
+      "grad_norm": 0.419921875,
+      "learning_rate": 1.2615062293021507e-05,
+      "loss": 1.0051,
       "step": 470
     },
     {
+      "epoch": 8.636363636363637,
+      "grad_norm": 0.484375,
+      "learning_rate": 1.1116455134507664e-05,
+      "loss": 0.9905,
       "step": 475
     },
     {
+      "epoch": 8.727272727272727,
+      "grad_norm": 0.44140625,
+      "learning_rate": 9.707346171337894e-06,
+      "loss": 0.9977,
       "step": 480
     },
     {
+      "epoch": 8.818181818181818,
+      "grad_norm": 0.43359375,
+      "learning_rate": 8.38915425679304e-06,
+      "loss": 0.9894,
       "step": 485
     },
     {
+      "epoch": 8.909090909090908,
+      "grad_norm": 0.470703125,
+      "learning_rate": 7.163206698392744e-06,
+      "loss": 1.005,
       "step": 490
     },
     {
+      "epoch": 9.0,
+      "grad_norm": 0.42578125,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.9986,
+      "step": 495
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 2.5596535205841064,
+      "eval_runtime": 0.2334,
+      "eval_samples_per_second": 42.846,
+      "eval_steps_per_second": 4.285,
       "step": 495
     },
     {
+      "epoch": 9.090909090909092,
+      "grad_norm": 0.455078125,
+      "learning_rate": 4.992888225905468e-06,
+      "loss": 0.9957,
       "step": 500
     },
     {
+      "epoch": 9.181818181818182,
+      "grad_norm": 0.47265625,
+      "learning_rate": 4.050702638550275e-06,
+      "loss": 1.0036,
       "step": 505
     },
     {
+      "epoch": 9.272727272727273,
+      "grad_norm": 0.462890625,
+      "learning_rate": 3.2051298603643753e-06,
+      "loss": 0.9985,
       "step": 510
     },
     {
+      "epoch": 9.363636363636363,
+      "grad_norm": 0.458984375,
+      "learning_rate": 2.4570213114592954e-06,
+      "loss": 0.9961,
       "step": 515
     },
     {
+      "epoch": 9.454545454545455,
+      "grad_norm": 0.443359375,
+      "learning_rate": 1.8071302737293295e-06,
+      "loss": 1.0066,
       "step": 520
     },
     {
+      "epoch": 9.545454545454545,
+      "grad_norm": 0.43359375,
+      "learning_rate": 1.2561111323605712e-06,
+      "loss": 0.9996,
       "step": 525
     },
     {
+      "epoch": 9.636363636363637,
+      "grad_norm": 0.439453125,
+      "learning_rate": 8.04518716920466e-07,
+      "loss": 0.9941,
       "step": 530
     },
     {
+      "epoch": 9.727272727272727,
+      "grad_norm": 0.419921875,
+      "learning_rate": 4.5280774269154115e-07,
+      "loss": 0.9898,
       "step": 535
     },
     {
+      "epoch": 9.818181818181818,
+      "grad_norm": 0.419921875,
+      "learning_rate": 2.0133235281156736e-07,
+      "loss": 0.9888,
       "step": 540
     },
     {
+      "epoch": 9.909090909090908,
+      "grad_norm": 0.4453125,
+      "learning_rate": 5.0345761681491746e-08,
+      "loss": 0.9953,
       "step": 545
     },
     {
+      "epoch": 10.0,
+      "grad_norm": 0.41796875,
+      "learning_rate": 0.0,
+      "loss": 0.9948,
       "step": 550
     },
     {
+      "epoch": 10.0,
+      "eval_loss": 2.5602283477783203,
+      "eval_runtime": 0.2428,
+      "eval_samples_per_second": 41.182,
+      "eval_steps_per_second": 4.118,
+      "step": 550
     },
     {
+      "epoch": 10.0,
+      "step": 550,
+      "total_flos": 1.6777423328808796e+18,
+      "train_loss": 3.202145513187755,
+      "train_runtime": 1331.6624,
+      "train_samples_per_second": 26.313,
+      "train_steps_per_second": 0.413
     }
   ],
   "logging_steps": 5,
+  "max_steps": 550,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 100,
+  "total_flos": 1.6777423328808796e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:281bac798d4f46cabe9ee13145f7c00c9bfe2797573fcfc2ff4225f62bc6a512
 size 5176

 version https://git-lfs.github.com/spec/v1
+oid sha256:fd097c126007f600ecfb0e51962f9ceca5bf741f3de46df16b3ca140da5142dc
 size 5176