Model save

Browse files

Files changed (6) hide show

README.md +78 -0
adapter_model.safetensors +1 -1
all_results.json +9 -0
runs/Jun08_17-03-13_user-WS-C621E-SAGE-Series/events.out.tfevents.1717833873.user-WS-C621E-SAGE-Series.13572.0 +2 -2
train_results.json +9 -0
trainer_state.json +2959 -0

README.md ADDED Viewed

	@@ -0,0 +1,78 @@

+---
+license: gemma
+library_name: peft
+tags:
+- trl
+- sft
+- generated_from_trainer
+base_model: google/gemma-7b
+datasets:
+- generator
+model-index:
+- name: gemma7b-summarize-gpt4o-32k
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# gemma7b-summarize-gpt4o-32k
+This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the `generator` dataset (the name recorded by the Trainer; the underlying data source is not documented in this card).
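+It achieves the following results on the evaluation set at the final training step (step 2030):
+- Loss: 4.7170
+- Note: this is not the lowest validation loss of the run. In the training results table, validation loss is lowest at epoch 2.0 (step 407, 2.4260) and rises at every later evaluation while training loss keeps falling, which suggests overfitting from epoch 3 onward.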
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 2
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 16
+- total_eval_batch_size: 8
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 10
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 1.0031        | 0.9975 | 203  | 2.5299          |
+| 0.8685        | 2.0    | 407  | 2.4260          |
+| 0.8           | 2.9975 | 610  | 2.5051          |
+| 0.6938        | 4.0    | 814  | 2.6558          |
+| 0.5865        | 4.9975 | 1017 | 2.9163          |
+| 0.4821        | 6.0    | 1221 | 3.3285          |
+| 0.3899        | 6.9975 | 1424 | 3.8984          |
+| 0.3321        | 8.0    | 1628 | 4.4348          |
+| 0.3089        | 8.9975 | 1831 | 4.6891          |
+| 0.3016        | 9.9754 | 2030 | 4.7170          |
+### Framework versions
+- PEFT 0.10.0
+- Transformers 4.40.0
+- Pytorch 2.2.1+cu121
+- Datasets 2.18.0
+- Tokenizers 0.19.1

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:99ef1f4096e2653da59fa0896f5dea80fc03654db430b030ec5f1112abbf40a0
 size 50056096

 version https://git-lfs.github.com/spec/v1
+oid sha256:125f331ca9a8684aec05b7fd7ab37fb0f3ec43d64414a7c4aa9dfc598b180031
 size 50056096

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 9.975429975429975,
+    "total_flos": 1.5518062706111283e+18,
+    "train_loss": 1.3545548074938394,
+    "train_runtime": 13205.4864,
+    "train_samples": 29997,
+    "train_samples_per_second": 2.465,
+    "train_steps_per_second": 0.154
+}

runs/Jun08_17-03-13_user-WS-C621E-SAGE-Series/events.out.tfevents.1717833873.user-WS-C621E-SAGE-Series.13572.0 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:80a7c17b4b9f449f2c80b040bbce379bac47f9e715e9c0863d9ce774c3b1ec4e
-size 92378

 version https://git-lfs.github.com/spec/v1
+oid sha256:c5379c8320c1ec38d3a342189d2539ae48a02598f593cdad40d5c6748b67dbf2
+size 94269

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 9.975429975429975,
+    "total_flos": 1.5518062706111283e+18,
+    "train_loss": 1.3545548074938394,
+    "train_runtime": 13205.4864,
+    "train_samples": 29997,
+    "train_samples_per_second": 2.465,
+    "train_steps_per_second": 0.154
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2959 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 9.975429975429975,
+  "eval_steps": 500,
+  "global_step": 2030,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.004914004914004914,
+      "grad_norm": 552.0,
+      "learning_rate": 9.852216748768474e-07,
+      "loss": 51.7129,
+      "step": 1
+    },
+    {
+      "epoch": 0.02457002457002457,
+      "grad_norm": 404.0,
+      "learning_rate": 4.926108374384237e-06,
+      "loss": 45.7069,
+      "step": 5
+    },
+    {
+      "epoch": 0.04914004914004914,
+      "grad_norm": 388.0,
+      "learning_rate": 9.852216748768475e-06,
+      "loss": 46.9702,
+      "step": 10
+    },
+    {
+      "epoch": 0.07371007371007371,
+      "grad_norm": 292.0,
+      "learning_rate": 1.4778325123152711e-05,
+      "loss": 37.466,
+      "step": 15
+    },
+    {
+      "epoch": 0.09828009828009827,
+      "grad_norm": 79.5,
+      "learning_rate": 1.970443349753695e-05,
+      "loss": 28.4521,
+      "step": 20
+    },
+    {
+      "epoch": 0.12285012285012285,
+      "grad_norm": 47.25,
+      "learning_rate": 2.4630541871921184e-05,
+      "loss": 23.8617,
+      "step": 25
+    },
+    {
+      "epoch": 0.14742014742014742,
+      "grad_norm": 26.625,
+      "learning_rate": 2.9556650246305422e-05,
+      "loss": 22.4994,
+      "step": 30
+    },
+    {
+      "epoch": 0.171990171990172,
+      "grad_norm": 12.625,
+      "learning_rate": 3.4482758620689657e-05,
+      "loss": 20.8855,
+      "step": 35
+    },
+    {
+      "epoch": 0.19656019656019655,
+      "grad_norm": 7.84375,
+      "learning_rate": 3.94088669950739e-05,
+      "loss": 19.2048,
+      "step": 40
+    },
+    {
+      "epoch": 0.22113022113022113,
+      "grad_norm": 8.5,
+      "learning_rate": 4.433497536945813e-05,
+      "loss": 18.9068,
+      "step": 45
+    },
+    {
+      "epoch": 0.2457002457002457,
+      "grad_norm": 12.625,
+      "learning_rate": 4.926108374384237e-05,
+      "loss": 17.7765,
+      "step": 50
+    },
+    {
+      "epoch": 0.2702702702702703,
+      "grad_norm": 24.875,
+      "learning_rate": 5.41871921182266e-05,
+      "loss": 16.5829,
+      "step": 55
+    },
+    {
+      "epoch": 0.29484029484029484,
+      "grad_norm": 76.5,
+      "learning_rate": 5.9113300492610844e-05,
+      "loss": 13.7055,
+      "step": 60
+    },
+    {
+      "epoch": 0.3194103194103194,
+      "grad_norm": 52.25,
+      "learning_rate": 6.403940886699507e-05,
+      "loss": 7.5296,
+      "step": 65
+    },
+    {
+      "epoch": 0.343980343980344,
+      "grad_norm": 11.9375,
+      "learning_rate": 6.896551724137931e-05,
+      "loss": 2.7412,
+      "step": 70
+    },
+    {
+      "epoch": 0.36855036855036855,
+      "grad_norm": 2.984375,
+      "learning_rate": 7.389162561576355e-05,
+      "loss": 2.1519,
+      "step": 75
+    },
+    {
+      "epoch": 0.3931203931203931,
+      "grad_norm": 2.625,
+      "learning_rate": 7.88177339901478e-05,
+      "loss": 1.849,
+      "step": 80
+    },
+    {
+      "epoch": 0.4176904176904177,
+      "grad_norm": 3.21875,
+      "learning_rate": 8.374384236453202e-05,
+      "loss": 1.6847,
+      "step": 85
+    },
+    {
+      "epoch": 0.44226044226044225,
+      "grad_norm": 2.5625,
+      "learning_rate": 8.866995073891627e-05,
+      "loss": 1.528,
+      "step": 90
+    },
+    {
+      "epoch": 0.4668304668304668,
+      "grad_norm": 5.65625,
+      "learning_rate": 9.35960591133005e-05,
+      "loss": 1.4458,
+      "step": 95
+    },
+    {
+      "epoch": 0.4914004914004914,
+      "grad_norm": 1.96875,
+      "learning_rate": 9.852216748768474e-05,
+      "loss": 1.3912,
+      "step": 100
+    },
+    {
+      "epoch": 0.515970515970516,
+      "grad_norm": 4.34375,
+      "learning_rate": 0.00010344827586206898,
+      "loss": 1.3435,
+      "step": 105
+    },
+    {
+      "epoch": 0.5405405405405406,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0001083743842364532,
+      "loss": 1.2916,
+      "step": 110
+    },
+    {
+      "epoch": 0.5651105651105651,
+      "grad_norm": 2.828125,
+      "learning_rate": 0.00011330049261083743,
+      "loss": 1.268,
+      "step": 115
+    },
+    {
+      "epoch": 0.5896805896805897,
+      "grad_norm": 8.75,
+      "learning_rate": 0.00011822660098522169,
+      "loss": 1.2419,
+      "step": 120
+    },
+    {
+      "epoch": 0.6142506142506142,
+      "grad_norm": 9.875,
+      "learning_rate": 0.00012315270935960593,
+      "loss": 1.1848,
+      "step": 125
+    },
+    {
+      "epoch": 0.6388206388206388,
+      "grad_norm": 2.859375,
+      "learning_rate": 0.00012807881773399014,
+      "loss": 1.1821,
+      "step": 130
+    },
+    {
+      "epoch": 0.6633906633906634,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.00013300492610837438,
+      "loss": 1.1308,
+      "step": 135
+    },
+    {
+      "epoch": 0.687960687960688,
+      "grad_norm": 3.453125,
+      "learning_rate": 0.00013793103448275863,
+      "loss": 1.1049,
+      "step": 140
+    },
+    {
+      "epoch": 0.7125307125307125,
+      "grad_norm": 5.53125,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 1.0904,
+      "step": 145
+    },
+    {
+      "epoch": 0.7371007371007371,
+      "grad_norm": 36.75,
+      "learning_rate": 0.0001477832512315271,
+      "loss": 1.0884,
+      "step": 150
+    },
+    {
+      "epoch": 0.7616707616707616,
+      "grad_norm": 2.1875,
+      "learning_rate": 0.00015270935960591132,
+      "loss": 1.0498,
+      "step": 155
+    },
+    {
+      "epoch": 0.7862407862407862,
+      "grad_norm": 3.734375,
+      "learning_rate": 0.0001576354679802956,
+      "loss": 1.0656,
+      "step": 160
+    },
+    {
+      "epoch": 0.8108108108108109,
+      "grad_norm": 2.71875,
+      "learning_rate": 0.0001625615763546798,
+      "loss": 1.0436,
+      "step": 165
+    },
+    {
+      "epoch": 0.8353808353808354,
+      "grad_norm": 3.03125,
+      "learning_rate": 0.00016748768472906405,
+      "loss": 1.0507,
+      "step": 170
+    },
+    {
+      "epoch": 0.85995085995086,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.00017241379310344826,
+      "loss": 1.0593,
+      "step": 175
+    },
+    {
+      "epoch": 0.8845208845208845,
+      "grad_norm": 28.875,
+      "learning_rate": 0.00017733990147783253,
+      "loss": 1.0432,
+      "step": 180
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.00018226600985221675,
+      "loss": 1.0558,
+      "step": 185
+    },
+    {
+      "epoch": 0.9336609336609336,
+      "grad_norm": 3.796875,
+      "learning_rate": 0.000187192118226601,
+      "loss": 1.0305,
+      "step": 190
+    },
+    {
+      "epoch": 0.9582309582309583,
+      "grad_norm": 1.9140625,
+      "learning_rate": 0.00019211822660098523,
+      "loss": 1.0315,
+      "step": 195
+    },
+    {
+      "epoch": 0.9828009828009828,
+      "grad_norm": 3.28125,
+      "learning_rate": 0.00019704433497536947,
+      "loss": 1.0031,
+      "step": 200
+    },
+    {
+      "epoch": 0.9975429975429976,
+      "eval_loss": 2.529916286468506,
+      "eval_runtime": 2.0529,
+      "eval_samples_per_second": 4.871,
+      "eval_steps_per_second": 0.974,
+      "step": 203
+    },
+    {
+      "epoch": 1.0073710073710074,
+      "grad_norm": 9.4375,
+      "learning_rate": 0.00019999940863962815,
+      "loss": 1.0212,
+      "step": 205
+    },
+    {
+      "epoch": 1.031941031941032,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.00019999275591576766,
+      "loss": 0.9729,
+      "step": 210
+    },
+    {
+      "epoch": 1.0565110565110565,
+      "grad_norm": 5.875,
+      "learning_rate": 0.00019997871176098827,
+      "loss": 0.9684,
+      "step": 215
+    },
+    {
+      "epoch": 1.0810810810810811,
+      "grad_norm": 3.328125,
+      "learning_rate": 0.00019995727721342914,
+      "loss": 0.9533,
+      "step": 220
+    },
+    {
+      "epoch": 1.1056511056511056,
+      "grad_norm": 1.1953125,
+      "learning_rate": 0.00019992845385752485,
+      "loss": 0.9418,
+      "step": 225
+    },
+    {
+      "epoch": 1.1302211302211302,
+      "grad_norm": 2.3125,
+      "learning_rate": 0.00019989224382388813,
+      "loss": 0.9365,
+      "step": 230
+    },
+    {
+      "epoch": 1.154791154791155,
+      "grad_norm": 27.75,
+      "learning_rate": 0.00019984864978915253,
+      "loss": 0.9535,
+      "step": 235
+    },
+    {
+      "epoch": 1.1793611793611793,
+      "grad_norm": 3.03125,
+      "learning_rate": 0.00019979767497577445,
+      "loss": 0.9555,
+      "step": 240
+    },
+    {
+      "epoch": 1.203931203931204,
+      "grad_norm": 3.34375,
+      "learning_rate": 0.000199739323151795,
+      "loss": 0.9599,
+      "step": 245
+    },
+    {
+      "epoch": 1.2285012285012284,
+      "grad_norm": 1.8671875,
+      "learning_rate": 0.00019967359863056134,
+      "loss": 0.9268,
+      "step": 250
+    },
+    {
+      "epoch": 1.253071253071253,
+      "grad_norm": 1.96875,
+      "learning_rate": 0.00019960050627040806,
+      "loss": 0.9183,
+      "step": 255
+    },
+    {
+      "epoch": 1.2776412776412776,
+      "grad_norm": 8.125,
+      "learning_rate": 0.0001995200514742978,
+      "loss": 0.9041,
+      "step": 260
+    },
+    {
+      "epoch": 1.3022113022113022,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0001994322401894221,
+      "loss": 0.915,
+      "step": 265
+    },
+    {
+      "epoch": 1.3267813267813269,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00019933707890676158,
+      "loss": 0.9009,
+      "step": 270
+    },
+    {
+      "epoch": 1.3513513513513513,
+      "grad_norm": 3.515625,
+      "learning_rate": 0.00019923457466060636,
+      "loss": 0.918,
+      "step": 275
+    },
+    {
+      "epoch": 1.375921375921376,
+      "grad_norm": 1.1640625,
+      "learning_rate": 0.00019912473502803582,
+      "loss": 0.9329,
+      "step": 280
+    },
+    {
+      "epoch": 1.4004914004914004,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0001990075681283587,
+      "loss": 0.9269,
+      "step": 285
+    },
+    {
+      "epoch": 1.425061425061425,
+      "grad_norm": 4.875,
+      "learning_rate": 0.00019888308262251285,
+      "loss": 0.9245,
+      "step": 290
+    },
+    {
+      "epoch": 1.4496314496314495,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00019875128771242506,
+      "loss": 0.9252,
+      "step": 295
+    },
+    {
+      "epoch": 1.4742014742014742,
+      "grad_norm": 2.203125,
+      "learning_rate": 0.00019861219314033077,
+      "loss": 0.9055,
+      "step": 300
+    },
+    {
+      "epoch": 1.4987714987714988,
+      "grad_norm": 2.640625,
+      "learning_rate": 0.000198465809188054,
+      "loss": 0.9095,
+      "step": 305
+    },
+    {
+      "epoch": 1.5233415233415233,
+      "grad_norm": 1.7421875,
+      "learning_rate": 0.0001983121466762474,
+      "loss": 0.9061,
+      "step": 310
+    },
+    {
+      "epoch": 1.547911547911548,
+      "grad_norm": 3.671875,
+      "learning_rate": 0.00019815121696359212,
+      "loss": 0.8923,
+      "step": 315
+    },
+    {
+      "epoch": 1.5724815724815726,
+      "grad_norm": 2.875,
+      "learning_rate": 0.00019798303194595846,
+      "loss": 0.9021,
+      "step": 320
+    },
+    {
+      "epoch": 1.597051597051597,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00019780760405552645,
+      "loss": 0.8921,
+      "step": 325
+    },
+    {
+      "epoch": 1.6216216216216215,
+      "grad_norm": 1.09375,
+      "learning_rate": 0.00019762494625986677,
+      "loss": 0.8885,
+      "step": 330
+    },
+    {
+      "epoch": 1.6461916461916462,
+      "grad_norm": 2.671875,
+      "learning_rate": 0.00019743507206098233,
+      "loss": 0.8895,
+      "step": 335
+    },
+    {
+      "epoch": 1.6707616707616708,
+      "grad_norm": 0.7265625,
+      "learning_rate": 0.00019723799549431007,
+      "loss": 0.8976,
+      "step": 340
+    },
+    {
+      "epoch": 1.6953316953316953,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00019703373112768365,
+      "loss": 0.8836,
+      "step": 345
+    },
+    {
+      "epoch": 1.71990171990172,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00019682229406025635,
+      "loss": 0.8801,
+      "step": 350
+    },
+    {
+      "epoch": 1.7444717444717446,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.00019660369992138517,
+      "loss": 0.9048,
+      "step": 355
+    },
+    {
+      "epoch": 1.769041769041769,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.0001963779648694754,
+      "loss": 0.8906,
+      "step": 360
+    },
+    {
+      "epoch": 1.7936117936117935,
+      "grad_norm": 1.3203125,
+      "learning_rate": 0.00019614510559078625,
+      "loss": 0.8805,
+      "step": 365
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.00019590513929819734,
+      "loss": 0.8839,
+      "step": 370
+    },
+    {
+      "epoch": 1.8427518427518428,
+      "grad_norm": 1.2109375,
+      "learning_rate": 0.0001956580837299364,
+      "loss": 0.8715,
+      "step": 375
+    },
+    {
+      "epoch": 1.8673218673218672,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.000195403957148268,
+      "loss": 0.8952,
+      "step": 380
+    },
+    {
+      "epoch": 1.8918918918918919,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0001951427783381437,
+      "loss": 0.8585,
+      "step": 385
+    },
+    {
+      "epoch": 1.9164619164619165,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.0001948745666058134,
+      "loss": 0.8868,
+      "step": 390
+    },
+    {
+      "epoch": 1.941031941031941,
+      "grad_norm": 2.734375,
+      "learning_rate": 0.00019459934177739813,
+      "loss": 0.8778,
+      "step": 395
+    },
+    {
+      "epoch": 1.9656019656019657,
+      "grad_norm": 1.8359375,
+      "learning_rate": 0.00019431712419742484,
+      "loss": 0.8902,
+      "step": 400
+    },
+    {
+      "epoch": 1.9901719901719903,
+      "grad_norm": 3.078125,
+      "learning_rate": 0.00019402793472732217,
+      "loss": 0.8685,
+      "step": 405
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 2.4260478019714355,
+      "eval_runtime": 2.0431,
+      "eval_samples_per_second": 4.895,
+      "eval_steps_per_second": 0.979,
+      "step": 407
+    },
+    {
+      "epoch": 2.0147420147420148,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00019373179474387858,
+      "loss": 0.8187,
+      "step": 410
+    },
+    {
+      "epoch": 2.039312039312039,
+      "grad_norm": 0.9140625,
+      "learning_rate": 0.0001934287261376622,
+      "loss": 0.7808,
+      "step": 415
+    },
+    {
+      "epoch": 2.063882063882064,
+      "grad_norm": 0.73828125,
+      "learning_rate": 0.00019311875131140246,
+      "loss": 0.7746,
+      "step": 420
+    },
+    {
+      "epoch": 2.0884520884520885,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.00019280189317833445,
+      "loss": 0.7761,
+      "step": 425
+    },
+    {
+      "epoch": 2.113022113022113,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.00019247817516050483,
+      "loss": 0.7781,
+      "step": 430
+    },
+    {
+      "epoch": 2.1375921375921374,
+      "grad_norm": 0.93359375,
+      "learning_rate": 0.00019214762118704076,
+      "loss": 0.7648,
+      "step": 435
+    },
+    {
+      "epoch": 2.1621621621621623,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.0001918102556923809,
+      "loss": 0.7926,
+      "step": 440
+    },
+    {
+      "epoch": 2.1867321867321867,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.0001914661036144692,
+      "loss": 0.7821,
+      "step": 445
+    },
+    {
+      "epoch": 2.211302211302211,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00019111519039291167,
+      "loss": 0.7807,
+      "step": 450
+    },
+    {
+      "epoch": 2.235872235872236,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00019075754196709572,
+      "loss": 0.7994,
+      "step": 455
+    },
+    {
+      "epoch": 2.2604422604422605,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.0001903931847742728,
+      "loss": 0.7777,
+      "step": 460
+    },
+    {
+      "epoch": 2.285012285012285,
+      "grad_norm": 1.8203125,
+      "learning_rate": 0.00019002214574760423,
+      "loss": 0.7781,
+      "step": 465
+    },
+    {
+      "epoch": 2.30958230958231,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.0001896444523141701,
+      "loss": 0.7977,
+      "step": 470
+    },
+    {
+      "epoch": 2.3341523341523343,
+      "grad_norm": 0.73046875,
+      "learning_rate": 0.00018926013239294216,
+      "loss": 0.7758,
+      "step": 475
+    },
+    {
+      "epoch": 2.3587223587223587,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00018886921439271984,
+      "loss": 0.8019,
+      "step": 480
+    },
+    {
+      "epoch": 2.383292383292383,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00018847172721003043,
+      "loss": 0.7829,
+      "step": 485
+    },
+    {
+      "epoch": 2.407862407862408,
+      "grad_norm": 0.984375,
+      "learning_rate": 0.00018806770022699278,
+      "loss": 0.7759,
+      "step": 490
+    },
+    {
+      "epoch": 2.4324324324324325,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.0001876571633091458,
+      "loss": 0.776,
+      "step": 495
+    },
+    {
+      "epoch": 2.457002457002457,
+      "grad_norm": 1.1875,
+      "learning_rate": 0.00018724014680324057,
+      "loss": 0.7885,
+      "step": 500
+    },
+    {
+      "epoch": 2.4815724815724813,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00018681668153499697,
+      "loss": 0.7929,
+      "step": 505
+    },
+    {
+      "epoch": 2.506142506142506,
+      "grad_norm": 4.25,
+      "learning_rate": 0.00018638679880682543,
+      "loss": 0.7835,
+      "step": 510
+    },
+    {
+      "epoch": 2.5307125307125307,
+      "grad_norm": 1.109375,
+      "learning_rate": 0.00018595053039551274,
+      "loss": 0.7981,
+      "step": 515
+    },
+    {
+      "epoch": 2.555282555282555,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00018550790854987323,
+      "loss": 0.7831,
+      "step": 520
+    },
+    {
+      "epoch": 2.57985257985258,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00018505896598836508,
+      "loss": 0.7863,
+      "step": 525
+    },
+    {
+      "epoch": 2.6044226044226044,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.00018460373589667154,
+      "loss": 0.7929,
+      "step": 530
+    },
+    {
+      "epoch": 2.628992628992629,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.00018414225192524806,
+      "loss": 0.7722,
+      "step": 535
+    },
+    {
+      "epoch": 2.6535626535626538,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00018367454818683473,
+      "loss": 0.7832,
+      "step": 540
+    },
+    {
+      "epoch": 2.678132678132678,
+      "grad_norm": 1.2890625,
+      "learning_rate": 0.00018320065925393468,
+      "loss": 0.7746,
+      "step": 545
+    },
+    {
+      "epoch": 2.7027027027027026,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00018272062015625872,
+      "loss": 0.7826,
+      "step": 550
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.0001822344663781356,
+      "loss": 0.7946,
+      "step": 555
+    },
+    {
+      "epoch": 2.751842751842752,
+      "grad_norm": 1.0234375,
+      "learning_rate": 0.00018174223385588917,
+      "loss": 0.7849,
+      "step": 560
+    },
+    {
+      "epoch": 2.7764127764127764,
+      "grad_norm": 10.3125,
+      "learning_rate": 0.00018124395897518224,
+      "loss": 0.7783,
+      "step": 565
+    },
+    {
+      "epoch": 2.800982800982801,
+      "grad_norm": 0.8515625,
+      "learning_rate": 0.0001807396785683264,
+      "loss": 0.8005,
+      "step": 570
+    },
+    {
+      "epoch": 2.8255528255528253,
+      "grad_norm": 0.83984375,
+      "learning_rate": 0.00018022942991156,
+      "loss": 0.7981,
+      "step": 575
+    },
+    {
+      "epoch": 2.85012285012285,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00017971325072229226,
+      "loss": 0.7887,
+      "step": 580
+    },
+    {
+      "epoch": 2.8746928746928746,
+      "grad_norm": 0.8125,
+      "learning_rate": 0.0001791911791563154,
+      "loss": 0.8013,
+      "step": 585
+    },
+    {
+      "epoch": 2.899262899262899,
+      "grad_norm": 1.2578125,
+      "learning_rate": 0.00017866325380498416,
+      "loss": 0.777,
+      "step": 590
+    },
+    {
+      "epoch": 2.923832923832924,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00017812951369236316,
+      "loss": 0.7992,
+      "step": 595
+    },
+    {
+      "epoch": 2.9484029484029484,
+      "grad_norm": 1.640625,
+      "learning_rate": 0.00017758999827234212,
+      "loss": 0.7856,
+      "step": 600
+    },
+    {
+      "epoch": 2.972972972972973,
+      "grad_norm": 3.109375,
+      "learning_rate": 0.00017704474742571969,
+      "loss": 0.787,
+      "step": 605
+    },
+    {
+      "epoch": 2.9975429975429977,
+      "grad_norm": 3.8125,
+      "learning_rate": 0.00017649380145725517,
+      "loss": 0.8,
+      "step": 610
+    },
+    {
+      "epoch": 2.9975429975429977,
+      "eval_loss": 2.5051403045654297,
+      "eval_runtime": 2.0509,
+      "eval_samples_per_second": 4.876,
+      "eval_steps_per_second": 0.975,
+      "step": 610
+    },
+    {
+      "epoch": 3.022113022113022,
+      "grad_norm": 1.7890625,
+      "learning_rate": 0.00017593720109268944,
+      "loss": 0.6916,
+      "step": 615
+    },
+    {
+      "epoch": 3.0466830466830466,
+      "grad_norm": 9.4375,
+      "learning_rate": 0.00017537498747573443,
+      "loss": 0.6614,
+      "step": 620
+    },
+    {
+      "epoch": 3.0712530712530715,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00017480720216503183,
+      "loss": 0.6639,
+      "step": 625
+    },
+    {
+      "epoch": 3.095823095823096,
+      "grad_norm": 0.74609375,
+      "learning_rate": 0.000174233887131081,
+      "loss": 0.6579,
+      "step": 630
+    },
+    {
+      "epoch": 3.1203931203931203,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.0001736550847531366,
+      "loss": 0.6546,
+      "step": 635
+    },
+    {
+      "epoch": 3.1449631449631448,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.00017307083781607595,
+      "loss": 0.6731,
+      "step": 640
+    },
+    {
+      "epoch": 3.1695331695331697,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00017248118950723634,
+      "loss": 0.6761,
+      "step": 645
+    },
+    {
+      "epoch": 3.194103194103194,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.00017188618341322254,
+      "loss": 0.6761,
+      "step": 650
+    },
+    {
+      "epoch": 3.2186732186732185,
+      "grad_norm": 1.7421875,
+      "learning_rate": 0.00017128586351668524,
+      "loss": 0.666,
+      "step": 655
+    },
+    {
+      "epoch": 3.2432432432432434,
+      "grad_norm": 1.3515625,
+      "learning_rate": 0.00017068027419306936,
+      "loss": 0.6677,
+      "step": 660
+    },
+    {
+      "epoch": 3.267813267813268,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00017006946020733425,
+      "loss": 0.6663,
+      "step": 665
+    },
+    {
+      "epoch": 3.2923832923832923,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.00016945346671064452,
+      "loss": 0.6762,
+      "step": 670
+    },
+    {
+      "epoch": 3.3169533169533167,
+      "grad_norm": 0.7578125,
+      "learning_rate": 0.00016883233923703248,
+      "loss": 0.6842,
+      "step": 675
+    },
+    {
+      "epoch": 3.3415233415233416,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.00016820612370003221,
+      "loss": 0.6756,
+      "step": 680
+    },
+    {
+      "epoch": 3.366093366093366,
+      "grad_norm": 1.015625,
+      "learning_rate": 0.00016757486638928587,
+      "loss": 0.6757,
+      "step": 685
+    },
+    {
+      "epoch": 3.3906633906633905,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.00016693861396712168,
+      "loss": 0.6971,
+      "step": 690
+    },
+    {
+      "epoch": 3.4152334152334154,
+      "grad_norm": 1.125,
+      "learning_rate": 0.00016629741346510496,
+      "loss": 0.6837,
+      "step": 695
+    },
+    {
+      "epoch": 3.43980343980344,
+      "grad_norm": 2.6875,
+      "learning_rate": 0.00016565131228056133,
+      "loss": 0.6836,
+      "step": 700
+    },
+    {
+      "epoch": 3.4643734643734643,
+      "grad_norm": 0.77734375,
+      "learning_rate": 0.00016500035817307334,
+      "loss": 0.6719,
+      "step": 705
+    },
+    {
+      "epoch": 3.488943488943489,
+      "grad_norm": 0.75,
+      "learning_rate": 0.0001643445992609498,
+      "loss": 0.6722,
+      "step": 710
+    },
+    {
+      "epoch": 3.5135135135135136,
+      "grad_norm": 0.82421875,
+      "learning_rate": 0.00016368408401766916,
+      "loss": 0.6843,
+      "step": 715
+    },
+    {
+      "epoch": 3.538083538083538,
+      "grad_norm": 0.85546875,
+      "learning_rate": 0.0001630188612682963,
+      "loss": 0.6787,
+      "step": 720
+    },
+    {
+      "epoch": 3.562653562653563,
+      "grad_norm": 0.71875,
+      "learning_rate": 0.00016234898018587337,
+      "loss": 0.6639,
+      "step": 725
+    },
+    {
+      "epoch": 3.5872235872235874,
+      "grad_norm": 0.96484375,
+      "learning_rate": 0.00016167449028778484,
+      "loss": 0.6951,
+      "step": 730
+    },
+    {
+      "epoch": 3.611793611793612,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0001609954414320973,
+      "loss": 0.6881,
+      "step": 735
+    },
+    {
+      "epoch": 3.6363636363636362,
+      "grad_norm": 0.703125,
+      "learning_rate": 0.0001603118838138741,
+      "loss": 0.6761,
+      "step": 740
+    },
+    {
+      "epoch": 3.6609336609336607,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00015962386796146462,
+      "loss": 0.69,
+      "step": 745
+    },
+    {
+      "epoch": 3.6855036855036856,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.00015893144473276953,
+      "loss": 0.691,
+      "step": 750
+    },
+    {
+      "epoch": 3.71007371007371,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00015823466531148124,
+      "loss": 0.6843,
+      "step": 755
+    },
+    {
+      "epoch": 3.7346437346437344,
+      "grad_norm": 0.765625,
+      "learning_rate": 0.00015753358120330042,
+      "loss": 0.7094,
+      "step": 760
+    },
+    {
+      "epoch": 3.7592137592137593,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00015682824423212877,
+      "loss": 0.6892,
+      "step": 765
+    },
+    {
+      "epoch": 3.7837837837837838,
+      "grad_norm": 0.9296875,
+      "learning_rate": 0.00015611870653623825,
+      "loss": 0.6929,
+      "step": 770
+    },
+    {
+      "epoch": 3.808353808353808,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00015540502056441688,
+      "loss": 0.7022,
+      "step": 775
+    },
+    {
+      "epoch": 3.832923832923833,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00015468723907209193,
+      "loss": 0.703,
+      "step": 780
+    },
+    {
+      "epoch": 3.8574938574938575,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00015396541511743012,
+      "loss": 0.7027,
+      "step": 785
+    },
+    {
+      "epoch": 3.882063882063882,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00015323960205741561,
+      "loss": 0.6829,
+      "step": 790
+    },
+    {
+      "epoch": 3.906633906633907,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.00015250985354390596,
+      "loss": 0.6945,
+      "step": 795
+    },
+    {
+      "epoch": 3.9312039312039313,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.0001517762235196661,
+      "loss": 0.6966,
+      "step": 800
+    },
+    {
+      "epoch": 3.9557739557739557,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.00015103876621438086,
+      "loss": 0.6947,
+      "step": 805
+    },
+    {
+      "epoch": 3.98034398034398,
+      "grad_norm": 0.89453125,
+      "learning_rate": 0.00015029753614064645,
+      "loss": 0.6938,
+      "step": 810
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 2.6558120250701904,
+      "eval_runtime": 2.0446,
+      "eval_samples_per_second": 4.891,
+      "eval_steps_per_second": 0.978,
+      "step": 814
+    },
+    {
+      "epoch": 4.004914004914005,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.00014955258808994096,
+      "loss": 0.6676,
+      "step": 815
+    },
+    {
+      "epoch": 4.0294840294840295,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00014880397712857386,
+      "loss": 0.5659,
+      "step": 820
+    },
+    {
+      "epoch": 4.054054054054054,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.00014805175859361594,
+      "loss": 0.5525,
+      "step": 825
+    },
+    {
+      "epoch": 4.078624078624078,
+      "grad_norm": 1.328125,
+      "learning_rate": 0.00014729598808880861,
+      "loss": 0.5546,
+      "step": 830
+    },
+    {
+      "epoch": 4.103194103194103,
+      "grad_norm": 0.96875,
+      "learning_rate": 0.00014653672148045357,
+      "loss": 0.5665,
+      "step": 835
+    },
+    {
+      "epoch": 4.127764127764128,
+      "grad_norm": 1.0078125,
+      "learning_rate": 0.00014577401489328335,
+      "loss": 0.565,
+      "step": 840
+    },
+    {
+      "epoch": 4.152334152334152,
+      "grad_norm": 0.7734375,
+      "learning_rate": 0.0001450079247063127,
+      "loss": 0.5764,
+      "step": 845
+    },
+    {
+      "epoch": 4.176904176904177,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00014423850754867075,
+      "loss": 0.5565,
+      "step": 850
+    },
+    {
+      "epoch": 4.201474201474202,
+      "grad_norm": 0.921875,
+      "learning_rate": 0.0001434658202954153,
+      "loss": 0.5572,
+      "step": 855
+    },
+    {
+      "epoch": 4.226044226044226,
+      "grad_norm": 2.0,
+      "learning_rate": 0.00014268992006332846,
+      "loss": 0.5719,
+      "step": 860
+    },
+    {
+      "epoch": 4.250614250614251,
+      "grad_norm": 0.98828125,
+      "learning_rate": 0.0001419108642066947,
+      "loss": 0.5644,
+      "step": 865
+    },
+    {
+      "epoch": 4.275184275184275,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00014112871031306119,
+      "loss": 0.5785,
+      "step": 870
+    },
+    {
+      "epoch": 4.2997542997543,
+      "grad_norm": 1.734375,
+      "learning_rate": 0.00014034351619898088,
+      "loss": 0.5825,
+      "step": 875
+    },
+    {
+      "epoch": 4.324324324324325,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.00013955533990573886,
+      "loss": 0.5752,
+      "step": 880
+    },
+    {
+      "epoch": 4.348894348894349,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00013876423969506194,
+      "loss": 0.5863,
+      "step": 885
+    },
+    {
+      "epoch": 4.3734643734643734,
+      "grad_norm": 1.7421875,
+      "learning_rate": 0.00013797027404481184,
+      "loss": 0.5826,
+      "step": 890
+    },
+    {
+      "epoch": 4.398034398034398,
+      "grad_norm": 0.99609375,
+      "learning_rate": 0.0001371735016446627,
+      "loss": 0.576,
+      "step": 895
+    },
+    {
+      "epoch": 4.422604422604422,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00013637398139176255,
+      "loss": 0.577,
+      "step": 900
+    },
+    {
+      "epoch": 4.447174447174447,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00013557177238637986,
+      "loss": 0.5832,
+      "step": 905
+    },
+    {
+      "epoch": 4.471744471744472,
+      "grad_norm": 0.9375,
+      "learning_rate": 0.00013476693392753476,
+      "loss": 0.5856,
+      "step": 910
+    },
+    {
+      "epoch": 4.496314496314496,
+      "grad_norm": 1.1328125,
+      "learning_rate": 0.00013395952550861572,
+      "loss": 0.592,
+      "step": 915
+    },
+    {
+      "epoch": 4.520884520884521,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.00013314960681298175,
+      "loss": 0.5861,
+      "step": 920
+    },
+    {
+      "epoch": 4.545454545454545,
+      "grad_norm": 0.97265625,
+      "learning_rate": 0.0001323372377095507,
+      "loss": 0.5814,
+      "step": 925
+    },
+    {
+      "epoch": 4.57002457002457,
+      "grad_norm": 0.91796875,
+      "learning_rate": 0.0001315224782483737,
+      "loss": 0.5847,
+      "step": 930
+    },
+    {
+      "epoch": 4.594594594594595,
+      "grad_norm": 0.8203125,
+      "learning_rate": 0.00013070538865619642,
+      "loss": 0.5773,
+      "step": 935
+    },
+    {
+      "epoch": 4.61916461916462,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.00012988602933200689,
+      "loss": 0.5723,
+      "step": 940
+    },
+    {
+      "epoch": 4.643734643734644,
+      "grad_norm": 1.1171875,
+      "learning_rate": 0.0001290644608425711,
+      "loss": 0.5918,
+      "step": 945
+    },
+    {
+      "epoch": 4.6683046683046685,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.0001282407439179557,
+      "loss": 0.5932,
+      "step": 950
+    },
+    {
+      "epoch": 4.6928746928746925,
+      "grad_norm": 0.8671875,
+      "learning_rate": 0.00012741493944703905,
+      "loss": 0.5868,
+      "step": 955
+    },
+    {
+      "epoch": 4.717444717444717,
+      "grad_norm": 0.83203125,
+      "learning_rate": 0.0001265871084730101,
+      "loss": 0.5944,
+      "step": 960
+    },
+    {
+      "epoch": 4.742014742014742,
+      "grad_norm": 0.9765625,
+      "learning_rate": 0.00012575731218885625,
+      "loss": 0.5893,
+      "step": 965
+    },
+    {
+      "epoch": 4.766584766584766,
+      "grad_norm": 1.140625,
+      "learning_rate": 0.00012492561193284008,
+      "loss": 0.5791,
+      "step": 970
+    },
+    {
+      "epoch": 4.791154791154791,
+      "grad_norm": 1.1484375,
+      "learning_rate": 0.00012409206918396503,
+      "loss": 0.5876,
+      "step": 975
+    },
+    {
+      "epoch": 4.815724815724816,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.00012325674555743106,
+      "loss": 0.5871,
+      "step": 980
+    },
+    {
+      "epoch": 4.84029484029484,
+      "grad_norm": 0.8046875,
+      "learning_rate": 0.0001224197028000799,
+      "loss": 0.5987,
+      "step": 985
+    },
+    {
+      "epoch": 4.864864864864865,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.000121581002785831,
+      "loss": 0.5857,
+      "step": 990
+    },
+    {
+      "epoch": 4.88943488943489,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00012074070751110751,
+      "loss": 0.5932,
+      "step": 995
+    },
+    {
+      "epoch": 4.914004914004914,
+      "grad_norm": 1.0625,
+      "learning_rate": 0.00011989887909025388,
+      "loss": 0.5891,
+      "step": 1000
+    },
+    {
+      "epoch": 4.938574938574939,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.00011905557975094406,
+      "loss": 0.5881,
+      "step": 1005
+    },
+    {
+      "epoch": 4.963144963144963,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.00011821087182958186,
+      "loss": 0.5845,
+      "step": 1010
+    },
+    {
+      "epoch": 4.987714987714988,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.5865,
+      "step": 1015
+    },
+    {
+      "epoch": 4.997542997542998,
+      "eval_loss": 2.9162657260894775,
+      "eval_runtime": 2.0535,
+      "eval_samples_per_second": 4.87,
+      "eval_steps_per_second": 0.974,
+      "step": 1017
+    },
+    {
+      "epoch": 5.012285012285012,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0001165174801023096,
+      "loss": 0.5388,
+      "step": 1020
+    },
+    {
+      "epoch": 5.036855036855036,
+      "grad_norm": 0.953125,
+      "learning_rate": 0.00011566892147134705,
+      "loss": 0.4596,
+      "step": 1025
+    },
+    {
+      "epoch": 5.061425061425061,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.00011481920459897417,
+      "loss": 0.4681,
+      "step": 1030
+    },
+    {
+      "epoch": 5.085995085995086,
+      "grad_norm": 0.8828125,
+      "learning_rate": 0.00011396839229597674,
+      "loss": 0.4716,
+      "step": 1035
+    },
+    {
+      "epoch": 5.11056511056511,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.00011311654745411425,
+      "loss": 0.4667,
+      "step": 1040
+    },
+    {
+      "epoch": 5.135135135135135,
+      "grad_norm": 0.90625,
+      "learning_rate": 0.00011226373304147123,
+      "loss": 0.465,
+      "step": 1045
+    },
+    {
+      "epoch": 5.15970515970516,
+      "grad_norm": 0.828125,
+      "learning_rate": 0.00011141001209780249,
+      "loss": 0.4751,
+      "step": 1050
+    },
+    {
+      "epoch": 5.184275184275184,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00011055544772987335,
+      "loss": 0.4622,
+      "step": 1055
+    },
+    {
+      "epoch": 5.208845208845209,
+      "grad_norm": 1.234375,
+      "learning_rate": 0.0001097001031067947,
+      "loss": 0.4839,
+      "step": 1060
+    },
+    {
+      "epoch": 5.233415233415234,
+      "grad_norm": 0.91015625,
+      "learning_rate": 0.00010884404145535372,
+      "loss": 0.4679,
+      "step": 1065
+    },
+    {
+      "epoch": 5.257985257985258,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00010798732605534006,
+      "loss": 0.4764,
+      "step": 1070
+    },
+    {
+      "epoch": 5.282555282555283,
+      "grad_norm": 0.90234375,
+      "learning_rate": 0.00010713002023486816,
+      "loss": 0.4834,
+      "step": 1075
+    },
+    {
+      "epoch": 5.3071253071253075,
+      "grad_norm": 0.98046875,
+      "learning_rate": 0.00010627218736569624,
+      "loss": 0.4853,
+      "step": 1080
+    },
+    {
+      "epoch": 5.3316953316953315,
+      "grad_norm": 0.875,
+      "learning_rate": 0.00010541389085854176,
+      "loss": 0.4823,
+      "step": 1085
+    },
+    {
+      "epoch": 5.356265356265356,
+      "grad_norm": 1.03125,
+      "learning_rate": 0.00010455519415839415,
+      "loss": 0.4838,
+      "step": 1090
+    },
+    {
+      "epoch": 5.38083538083538,
+      "grad_norm": 0.92578125,
+      "learning_rate": 0.00010369616073982491,
+      "loss": 0.4881,
+      "step": 1095
+    },
+    {
+      "epoch": 5.405405405405405,
+      "grad_norm": 0.88671875,
+      "learning_rate": 0.00010283685410229571,
+      "loss": 0.485,
+      "step": 1100
+    },
+    {
+      "epoch": 5.42997542997543,
+      "grad_norm": 1.0390625,
+      "learning_rate": 0.00010197733776546447,
+      "loss": 0.4854,
+      "step": 1105
+    },
+    {
+      "epoch": 5.454545454545454,
+      "grad_norm": 0.9609375,
+      "learning_rate": 0.00010111767526449004,
+      "loss": 0.4851,
+      "step": 1110
+    },
+    {
+      "epoch": 5.479115479115479,
+      "grad_norm": 1.265625,
+      "learning_rate": 0.00010025793014533558,
+      "loss": 0.4889,
+      "step": 1115
+    },
+    {
+      "epoch": 5.503685503685504,
+      "grad_norm": 0.87890625,
+      "learning_rate": 9.939816596007146e-05,
+      "loss": 0.4772,
+      "step": 1120
+    },
+    {
+      "epoch": 5.528255528255528,
+      "grad_norm": 1.1640625,
+      "learning_rate": 9.853844626217737e-05,
+      "loss": 0.4857,
+      "step": 1125
+    },
+    {
+      "epoch": 5.552825552825553,
+      "grad_norm": 1.0703125,
+      "learning_rate": 9.767883460184443e-05,
+      "loss": 0.4869,
+      "step": 1130
+    },
+    {
+      "epoch": 5.577395577395578,
+      "grad_norm": 0.88671875,
+      "learning_rate": 9.681939452127784e-05,
+      "loss": 0.4806,
+      "step": 1135
+    },
+    {
+      "epoch": 5.601965601965602,
+      "grad_norm": 1.2421875,
+      "learning_rate": 9.596018954999953e-05,
+      "loss": 0.4897,
+      "step": 1140
+    },
+    {
+      "epoch": 5.6265356265356266,
+      "grad_norm": 1.015625,
+      "learning_rate": 9.510128320015224e-05,
+      "loss": 0.4749,
+      "step": 1145
+    },
+    {
+      "epoch": 5.651105651105651,
+      "grad_norm": 0.93359375,
+      "learning_rate": 9.424273896180482e-05,
+      "loss": 0.4837,
+      "step": 1150
+    },
+    {
+      "epoch": 5.675675675675675,
+      "grad_norm": 0.8984375,
+      "learning_rate": 9.338462029825886e-05,
+      "loss": 0.4928,
+      "step": 1155
+    },
+    {
+      "epoch": 5.7002457002457,
+      "grad_norm": 0.97265625,
+      "learning_rate": 9.252699064135758e-05,
+      "loss": 0.477,
+      "step": 1160
+    },
+    {
+      "epoch": 5.724815724815725,
+      "grad_norm": 0.87890625,
+      "learning_rate": 9.166991338679715e-05,
+      "loss": 0.4882,
+      "step": 1165
+    },
+    {
+      "epoch": 5.749385749385749,
+      "grad_norm": 1.078125,
+      "learning_rate": 9.081345188944019e-05,
+      "loss": 0.4902,
+      "step": 1170
+    },
+    {
+      "epoch": 5.773955773955774,
+      "grad_norm": 0.97265625,
+      "learning_rate": 8.995766945863277e-05,
+      "loss": 0.4873,
+      "step": 1175
+    },
+    {
+      "epoch": 5.798525798525798,
+      "grad_norm": 1.0625,
+      "learning_rate": 8.91026293535247e-05,
+      "loss": 0.4864,
+      "step": 1180
+    },
+    {
+      "epoch": 5.823095823095823,
+      "grad_norm": 1.1953125,
+      "learning_rate": 8.82483947783932e-05,
+      "loss": 0.4782,
+      "step": 1185
+    },
+    {
+      "epoch": 5.847665847665848,
+      "grad_norm": 0.921875,
+      "learning_rate": 8.739502887797107e-05,
+      "loss": 0.4918,
+      "step": 1190
+    },
+    {
+      "epoch": 5.872235872235873,
+      "grad_norm": 0.9296875,
+      "learning_rate": 8.654259473277892e-05,
+      "loss": 0.4955,
+      "step": 1195
+    },
+    {
+      "epoch": 5.896805896805897,
+      "grad_norm": 1.015625,
+      "learning_rate": 8.569115535446228e-05,
+      "loss": 0.4802,
+      "step": 1200
+    },
+    {
+      "epoch": 5.921375921375922,
+      "grad_norm": 0.8984375,
+      "learning_rate": 8.484077368113399e-05,
+      "loss": 0.4832,
+      "step": 1205
+    },
+    {
+      "epoch": 5.945945945945946,
+      "grad_norm": 0.87890625,
+      "learning_rate": 8.399151257272156e-05,
+      "loss": 0.4847,
+      "step": 1210
+    },
+    {
+      "epoch": 5.9705159705159705,
+      "grad_norm": 0.87890625,
+      "learning_rate": 8.314343480632078e-05,
+      "loss": 0.48,
+      "step": 1215
+    },
+    {
+      "epoch": 5.995085995085995,
+      "grad_norm": 0.87890625,
+      "learning_rate": 8.229660307155518e-05,
+      "loss": 0.4821,
+      "step": 1220
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 3.3285133838653564,
+      "eval_runtime": 2.0506,
+      "eval_samples_per_second": 4.877,
+      "eval_steps_per_second": 0.975,
+      "step": 1221
+    },
+    {
+      "epoch": 6.019656019656019,
+      "grad_norm": 0.80078125,
+      "learning_rate": 8.145107996594206e-05,
+      "loss": 0.4087,
+      "step": 1225
+    },
+    {
+      "epoch": 6.044226044226044,
+      "grad_norm": 0.98046875,
+      "learning_rate": 8.060692799026522e-05,
+      "loss": 0.3843,
+      "step": 1230
+    },
+    {
+      "epoch": 6.068796068796069,
+      "grad_norm": 0.828125,
+      "learning_rate": 7.976420954395518e-05,
+      "loss": 0.3844,
+      "step": 1235
+    },
+    {
+      "epoch": 6.093366093366093,
+      "grad_norm": 0.984375,
+      "learning_rate": 7.892298692047621e-05,
+      "loss": 0.3909,
+      "step": 1240
+    },
+    {
+      "epoch": 6.117936117936118,
+      "grad_norm": 1.1171875,
+      "learning_rate": 7.808332230272209e-05,
+      "loss": 0.393,
+      "step": 1245
+    },
+    {
+      "epoch": 6.142506142506143,
+      "grad_norm": 0.9453125,
+      "learning_rate": 7.724527775841914e-05,
+      "loss": 0.3818,
+      "step": 1250
+    },
+    {
+      "epoch": 6.167076167076167,
+      "grad_norm": 0.921875,
+      "learning_rate": 7.64089152355385e-05,
+      "loss": 0.3938,
+      "step": 1255
+    },
+    {
+      "epoch": 6.191646191646192,
+      "grad_norm": 0.90234375,
+      "learning_rate": 7.55742965577169e-05,
+      "loss": 0.3885,
+      "step": 1260
+    },
+    {
+      "epoch": 6.216216216216216,
+      "grad_norm": 0.94921875,
+      "learning_rate": 7.474148341968652e-05,
+      "loss": 0.3889,
+      "step": 1265
+    },
+    {
+      "epoch": 6.240786240786241,
+      "grad_norm": 1.171875,
+      "learning_rate": 7.391053738271466e-05,
+      "loss": 0.3932,
+      "step": 1270
+    },
+    {
+      "epoch": 6.2653562653562656,
+      "grad_norm": 0.91015625,
+      "learning_rate": 7.308151987005326e-05,
+      "loss": 0.3823,
+      "step": 1275
+    },
+    {
+      "epoch": 6.2899262899262895,
+      "grad_norm": 0.91015625,
+      "learning_rate": 7.225449216239821e-05,
+      "loss": 0.3857,
+      "step": 1280
+    },
+    {
+      "epoch": 6.314496314496314,
+      "grad_norm": 0.8671875,
+      "learning_rate": 7.142951539335981e-05,
+      "loss": 0.3973,
+      "step": 1285
+    },
+    {
+      "epoch": 6.339066339066339,
+      "grad_norm": 0.91796875,
+      "learning_rate": 7.060665054494362e-05,
+      "loss": 0.3901,
+      "step": 1290
+    },
+    {
+      "epoch": 6.363636363636363,
+      "grad_norm": 0.89453125,
+      "learning_rate": 6.978595844304271e-05,
+      "loss": 0.386,
+      "step": 1295
+    },
+    {
+      "epoch": 6.388206388206388,
+      "grad_norm": 1.03125,
+      "learning_rate": 6.89674997529416e-05,
+      "loss": 0.3881,
+      "step": 1300
+    },
+    {
+      "epoch": 6.412776412776413,
+      "grad_norm": 0.89453125,
+      "learning_rate": 6.815133497483157e-05,
+      "loss": 0.4006,
+      "step": 1305
+    },
+    {
+      "epoch": 6.437346437346437,
+      "grad_norm": 1.0234375,
+      "learning_rate": 6.733752443933878e-05,
+      "loss": 0.397,
+      "step": 1310
+    },
+    {
+      "epoch": 6.461916461916462,
+      "grad_norm": 1.0703125,
+      "learning_rate": 6.65261283030646e-05,
+      "loss": 0.3972,
+      "step": 1315
+    },
+    {
+      "epoch": 6.486486486486487,
+      "grad_norm": 1.0390625,
+      "learning_rate": 6.571720654413877e-05,
+      "loss": 0.3965,
+      "step": 1320
+    },
+    {
+      "epoch": 6.511056511056511,
+      "grad_norm": 0.9296875,
+      "learning_rate": 6.491081895778588e-05,
+      "loss": 0.3961,
+      "step": 1325
+    },
+    {
+      "epoch": 6.535626535626536,
+      "grad_norm": 1.0625,
+      "learning_rate": 6.410702515190543e-05,
+      "loss": 0.3919,
+      "step": 1330
+    },
+    {
+      "epoch": 6.560196560196561,
+      "grad_norm": 0.9453125,
+      "learning_rate": 6.330588454266542e-05,
+      "loss": 0.3916,
+      "step": 1335
+    },
+    {
+      "epoch": 6.584766584766585,
+      "grad_norm": 0.9453125,
+      "learning_rate": 6.250745635011048e-05,
+      "loss": 0.4089,
+      "step": 1340
+    },
+    {
+      "epoch": 6.6093366093366095,
+      "grad_norm": 0.9140625,
+      "learning_rate": 6.171179959378437e-05,
+      "loss": 0.401,
+      "step": 1345
+    },
+    {
+      "epoch": 6.6339066339066335,
+      "grad_norm": 0.89453125,
+      "learning_rate": 6.0918973088367116e-05,
+      "loss": 0.3927,
+      "step": 1350
+    },
+    {
+      "epoch": 6.658476658476658,
+      "grad_norm": 1.0,
+      "learning_rate": 6.012903543932766e-05,
+      "loss": 0.3899,
+      "step": 1355
+    },
+    {
+      "epoch": 6.683046683046683,
+      "grad_norm": 0.84765625,
+      "learning_rate": 5.934204503859158e-05,
+      "loss": 0.3952,
+      "step": 1360
+    },
+    {
+      "epoch": 6.707616707616707,
+      "grad_norm": 0.98828125,
+      "learning_rate": 5.8558060060224817e-05,
+      "loss": 0.3917,
+      "step": 1365
+    },
+    {
+      "epoch": 6.732186732186732,
+      "grad_norm": 0.92578125,
+      "learning_rate": 5.777713845613364e-05,
+      "loss": 0.3878,
+      "step": 1370
+    },
+    {
+      "epoch": 6.756756756756757,
+      "grad_norm": 0.95703125,
+      "learning_rate": 5.699933795178052e-05,
+      "loss": 0.4066,
+      "step": 1375
+    },
+    {
+      "epoch": 6.781326781326781,
+      "grad_norm": 0.90625,
+      "learning_rate": 5.622471604191746e-05,
+      "loss": 0.3993,
+      "step": 1380
+    },
+    {
+      "epoch": 6.805896805896806,
+      "grad_norm": 0.98828125,
+      "learning_rate": 5.545332998633572e-05,
+      "loss": 0.3975,
+      "step": 1385
+    },
+    {
+      "epoch": 6.830466830466831,
+      "grad_norm": 0.96875,
+      "learning_rate": 5.46852368056334e-05,
+      "loss": 0.4003,
+      "step": 1390
+    },
+    {
+      "epoch": 6.855036855036855,
+      "grad_norm": 0.828125,
+      "learning_rate": 5.392049327700026e-05,
+      "loss": 0.3978,
+      "step": 1395
+    },
+    {
+      "epoch": 6.87960687960688,
+      "grad_norm": 0.90625,
+      "learning_rate": 5.3159155930021e-05,
+      "loss": 0.4017,
+      "step": 1400
+    },
+    {
+      "epoch": 6.9041769041769046,
+      "grad_norm": 0.90625,
+      "learning_rate": 5.2401281042496494e-05,
+      "loss": 0.3992,
+      "step": 1405
+    },
+    {
+      "epoch": 6.9287469287469285,
+      "grad_norm": 0.95703125,
+      "learning_rate": 5.164692463628378e-05,
+      "loss": 0.3965,
+      "step": 1410
+    },
+    {
+      "epoch": 6.953316953316953,
+      "grad_norm": 0.9140625,
+      "learning_rate": 5.0896142473154987e-05,
+      "loss": 0.3883,
+      "step": 1415
+    },
+    {
+      "epoch": 6.977886977886978,
+      "grad_norm": 0.95703125,
+      "learning_rate": 5.014899005067524e-05,
+      "loss": 0.3899,
+      "step": 1420
+    },
+    {
+      "epoch": 6.997542997542998,
+      "eval_loss": 3.898437023162842,
+      "eval_runtime": 2.0525,
+      "eval_samples_per_second": 4.872,
+      "eval_steps_per_second": 0.974,
+      "step": 1424
+    },
+    {
+      "epoch": 7.002457002457002,
+      "grad_norm": 1.0546875,
+      "learning_rate": 4.940552259810063e-05,
+      "loss": 0.3846,
+      "step": 1425
+    },
+    {
+      "epoch": 7.027027027027027,
+      "grad_norm": 0.82421875,
+      "learning_rate": 4.866579507229545e-05,
+      "loss": 0.3325,
+      "step": 1430
+    },
+    {
+      "epoch": 7.051597051597051,
+      "grad_norm": 1.0,
+      "learning_rate": 4.792986215366976e-05,
+      "loss": 0.3266,
+      "step": 1435
+    },
+    {
+      "epoch": 7.076167076167076,
+      "grad_norm": 1.0078125,
+      "learning_rate": 4.7197778242137755e-05,
+      "loss": 0.3295,
+      "step": 1440
+    },
+    {
+      "epoch": 7.100737100737101,
+      "grad_norm": 0.80078125,
+      "learning_rate": 4.646959745309609e-05,
+      "loss": 0.3279,
+      "step": 1445
+    },
+    {
+      "epoch": 7.125307125307125,
+      "grad_norm": 0.96875,
+      "learning_rate": 4.574537361342407e-05,
+      "loss": 0.324,
+      "step": 1450
+    },
+    {
+      "epoch": 7.14987714987715,
+      "grad_norm": 0.87109375,
+      "learning_rate": 4.502516025750455e-05,
+      "loss": 0.319,
+      "step": 1455
+    },
+    {
+      "epoch": 7.174447174447175,
+      "grad_norm": 0.8125,
+      "learning_rate": 4.430901062326681e-05,
+      "loss": 0.3242,
+      "step": 1460
+    },
+    {
+      "epoch": 7.199017199017199,
+      "grad_norm": 0.8515625,
+      "learning_rate": 4.359697764825123e-05,
+      "loss": 0.3172,
+      "step": 1465
+    },
+    {
+      "epoch": 7.223587223587224,
+      "grad_norm": 0.9140625,
+      "learning_rate": 4.288911396569599e-05,
+      "loss": 0.3275,
+      "step": 1470
+    },
+    {
+      "epoch": 7.2481572481572485,
+      "grad_norm": 0.8828125,
+      "learning_rate": 4.21854719006467e-05,
+      "loss": 0.332,
+      "step": 1475
+    },
+    {
+      "epoch": 7.2727272727272725,
+      "grad_norm": 1.015625,
+      "learning_rate": 4.148610346608837e-05,
+      "loss": 0.3359,
+      "step": 1480
+    },
+    {
+      "epoch": 7.297297297297297,
+      "grad_norm": 0.87109375,
+      "learning_rate": 4.079106035910073e-05,
+      "loss": 0.3242,
+      "step": 1485
+    },
+    {
+      "epoch": 7.321867321867322,
+      "grad_norm": 0.86328125,
+      "learning_rate": 4.010039395703664e-05,
+      "loss": 0.3273,
+      "step": 1490
+    },
+    {
+      "epoch": 7.346437346437346,
+      "grad_norm": 0.9140625,
+      "learning_rate": 3.94141553137245e-05,
+      "loss": 0.3274,
+      "step": 1495
+    },
+    {
+      "epoch": 7.371007371007371,
+      "grad_norm": 0.88671875,
+      "learning_rate": 3.873239515569429e-05,
+      "loss": 0.3266,
+      "step": 1500
+    },
+    {
+      "epoch": 7.395577395577396,
+      "grad_norm": 0.84765625,
+      "learning_rate": 3.80551638784277e-05,
+      "loss": 0.3363,
+      "step": 1505
+    },
+    {
+      "epoch": 7.42014742014742,
+      "grad_norm": 0.87109375,
+      "learning_rate": 3.738251154263333e-05,
+      "loss": 0.335,
+      "step": 1510
+    },
+    {
+      "epoch": 7.444717444717445,
+      "grad_norm": 0.84765625,
+      "learning_rate": 3.671448787054571e-05,
+      "loss": 0.3305,
+      "step": 1515
+    },
+    {
+      "epoch": 7.469287469287469,
+      "grad_norm": 0.8515625,
+      "learning_rate": 3.605114224225028e-05,
+      "loss": 0.332,
+      "step": 1520
+    },
+    {
+      "epoch": 7.493857493857494,
+      "grad_norm": 0.89453125,
+      "learning_rate": 3.5392523692033006e-05,
+      "loss": 0.3261,
+      "step": 1525
+    },
+    {
+      "epoch": 7.518427518427519,
+      "grad_norm": 1.09375,
+      "learning_rate": 3.473868090475574e-05,
+      "loss": 0.3363,
+      "step": 1530
+    },
+    {
+      "epoch": 7.542997542997543,
+      "grad_norm": 0.88671875,
+      "learning_rate": 3.408966221225773e-05,
+      "loss": 0.3239,
+      "step": 1535
+    },
+    {
+      "epoch": 7.5675675675675675,
+      "grad_norm": 0.875,
+      "learning_rate": 3.3445515589782574e-05,
+      "loss": 0.3301,
+      "step": 1540
+    },
+    {
+      "epoch": 7.592137592137592,
+      "grad_norm": 0.875,
+      "learning_rate": 3.2806288652432174e-05,
+      "loss": 0.3228,
+      "step": 1545
+    },
+    {
+      "epoch": 7.616707616707616,
+      "grad_norm": 0.8828125,
+      "learning_rate": 3.217202865164697e-05,
+      "loss": 0.3389,
+      "step": 1550
+    },
+    {
+      "epoch": 7.641277641277641,
+      "grad_norm": 0.859375,
+      "learning_rate": 3.154278247171314e-05,
+      "loss": 0.3302,
+      "step": 1555
+    },
+    {
+      "epoch": 7.665847665847666,
+      "grad_norm": 0.89453125,
+      "learning_rate": 3.09185966262968e-05,
+      "loss": 0.3245,
+      "step": 1560
+    },
+    {
+      "epoch": 7.69041769041769,
+      "grad_norm": 0.86328125,
+      "learning_rate": 3.0299517255005937e-05,
+      "loss": 0.3306,
+      "step": 1565
+    },
+    {
+      "epoch": 7.714987714987715,
+      "grad_norm": 0.90625,
+      "learning_rate": 2.9685590119979688e-05,
+      "loss": 0.3322,
+      "step": 1570
+    },
+    {
+      "epoch": 7.739557739557739,
+      "grad_norm": 0.8828125,
+      "learning_rate": 2.9076860602505564e-05,
+      "loss": 0.3327,
+      "step": 1575
+    },
+    {
+      "epoch": 7.764127764127764,
+      "grad_norm": 0.953125,
+      "learning_rate": 2.8473373699664997e-05,
+      "loss": 0.3342,
+      "step": 1580
+    },
+    {
+      "epoch": 7.788697788697789,
+      "grad_norm": 0.89453125,
+      "learning_rate": 2.7875174021007e-05,
+      "loss": 0.3318,
+      "step": 1585
+    },
+    {
+      "epoch": 7.813267813267814,
+      "grad_norm": 0.85546875,
+      "learning_rate": 2.728230578525086e-05,
+      "loss": 0.3357,
+      "step": 1590
+    },
+    {
+      "epoch": 7.837837837837838,
+      "grad_norm": 0.8984375,
+      "learning_rate": 2.669481281701739e-05,
+      "loss": 0.3345,
+      "step": 1595
+    },
+    {
+      "epoch": 7.862407862407863,
+      "grad_norm": 0.94921875,
+      "learning_rate": 2.6112738543589312e-05,
+      "loss": 0.3324,
+      "step": 1600
+    },
+    {
+      "epoch": 7.886977886977887,
+      "grad_norm": 0.86328125,
+      "learning_rate": 2.553612599170143e-05,
+      "loss": 0.3278,
+      "step": 1605
+    },
+    {
+      "epoch": 7.9115479115479115,
+      "grad_norm": 0.8046875,
+      "learning_rate": 2.496501778435977e-05,
+      "loss": 0.323,
+      "step": 1610
+    },
+    {
+      "epoch": 7.936117936117936,
+      "grad_norm": 0.953125,
+      "learning_rate": 2.4399456137691147e-05,
+      "loss": 0.3364,
+      "step": 1615
+    },
+    {
+      "epoch": 7.96068796068796,
+      "grad_norm": 0.85546875,
+      "learning_rate": 2.3839482857822458e-05,
+      "loss": 0.3348,
+      "step": 1620
+    },
+    {
+      "epoch": 7.985257985257985,
+      "grad_norm": 0.92578125,
+      "learning_rate": 2.328513933779034e-05,
+      "loss": 0.3321,
+      "step": 1625
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 4.43484354019165,
+      "eval_runtime": 2.0469,
+      "eval_samples_per_second": 4.885,
+      "eval_steps_per_second": 0.977,
+      "step": 1628
+    },
+    {
+      "epoch": 8.00982800982801,
+      "grad_norm": 0.703125,
+      "learning_rate": 2.2736466554481617e-05,
+      "loss": 0.3114,
+      "step": 1630
+    },
+    {
+      "epoch": 8.034398034398034,
+      "grad_norm": 0.78125,
+      "learning_rate": 2.2193505065604014e-05,
+      "loss": 0.3037,
+      "step": 1635
+    },
+    {
+      "epoch": 8.058968058968059,
+      "grad_norm": 0.78515625,
+      "learning_rate": 2.1656295006688353e-05,
+      "loss": 0.2959,
+      "step": 1640
+    },
+    {
+      "epoch": 8.083538083538084,
+      "grad_norm": 0.88671875,
+      "learning_rate": 2.1124876088121692e-05,
+      "loss": 0.3102,
+      "step": 1645
+    },
+    {
+      "epoch": 8.108108108108109,
+      "grad_norm": 0.83984375,
+      "learning_rate": 2.0599287592211968e-05,
+      "loss": 0.2975,
+      "step": 1650
+    },
+    {
+      "epoch": 8.132678132678132,
+      "grad_norm": 0.82421875,
+      "learning_rate": 2.0079568370284128e-05,
+      "loss": 0.2934,
+      "step": 1655
+    },
+    {
+      "epoch": 8.157248157248157,
+      "grad_norm": 0.84765625,
+      "learning_rate": 1.956575683980846e-05,
+      "loss": 0.299,
+      "step": 1660
+    },
+    {
+      "epoch": 8.181818181818182,
+      "grad_norm": 0.81640625,
+      "learning_rate": 1.9057890981560677e-05,
+      "loss": 0.2952,
+      "step": 1665
+    },
+    {
+      "epoch": 8.206388206388207,
+      "grad_norm": 0.8046875,
+      "learning_rate": 1.85560083368143e-05,
+      "loss": 0.3067,
+      "step": 1670
+    },
+    {
+      "epoch": 8.230958230958231,
+      "grad_norm": 0.8203125,
+      "learning_rate": 1.806014600456588e-05,
+      "loss": 0.2978,
+      "step": 1675
+    },
+    {
+      "epoch": 8.255528255528256,
+      "grad_norm": 0.80078125,
+      "learning_rate": 1.757034063879235e-05,
+      "loss": 0.2973,
+      "step": 1680
+    },
+    {
+      "epoch": 8.28009828009828,
+      "grad_norm": 0.796875,
+      "learning_rate": 1.708662844574178e-05,
+      "loss": 0.2983,
+      "step": 1685
+    },
+    {
+      "epoch": 8.304668304668304,
+      "grad_norm": 0.78125,
+      "learning_rate": 1.6609045181256976e-05,
+      "loss": 0.2991,
+      "step": 1690
+    },
+    {
+      "epoch": 8.32923832923833,
+      "grad_norm": 0.8203125,
+      "learning_rate": 1.61376261481323e-05,
+      "loss": 0.3102,
+      "step": 1695
+    },
+    {
+      "epoch": 8.353808353808354,
+      "grad_norm": 0.8828125,
+      "learning_rate": 1.5672406193504384e-05,
+      "loss": 0.2984,
+      "step": 1700
+    },
+    {
+      "epoch": 8.378378378378379,
+      "grad_norm": 0.7578125,
+      "learning_rate": 1.5213419706275878e-05,
+      "loss": 0.298,
+      "step": 1705
+    },
+    {
+      "epoch": 8.402948402948404,
+      "grad_norm": 0.8046875,
+      "learning_rate": 1.4760700614573731e-05,
+      "loss": 0.3021,
+      "step": 1710
+    },
+    {
+      "epoch": 8.427518427518427,
+      "grad_norm": 0.84375,
+      "learning_rate": 1.4314282383241096e-05,
+      "loss": 0.3068,
+      "step": 1715
+    },
+    {
+      "epoch": 8.452088452088452,
+      "grad_norm": 0.82421875,
+      "learning_rate": 1.3874198011363582e-05,
+      "loss": 0.3038,
+      "step": 1720
+    },
+    {
+      "epoch": 8.476658476658477,
+      "grad_norm": 0.8359375,
+      "learning_rate": 1.3440480029830127e-05,
+      "loss": 0.3024,
+      "step": 1725
+    },
+    {
+      "epoch": 8.501228501228502,
+      "grad_norm": 0.86328125,
+      "learning_rate": 1.301316049892818e-05,
+      "loss": 0.3018,
+      "step": 1730
+    },
+    {
+      "epoch": 8.525798525798526,
+      "grad_norm": 0.87890625,
+      "learning_rate": 1.2592271005973888e-05,
+      "loss": 0.3034,
+      "step": 1735
+    },
+    {
+      "epoch": 8.55036855036855,
+      "grad_norm": 0.8046875,
+      "learning_rate": 1.2177842662977135e-05,
+      "loss": 0.2999,
+      "step": 1740
+    },
+    {
+      "epoch": 8.574938574938574,
+      "grad_norm": 0.91796875,
+      "learning_rate": 1.1769906104341832e-05,
+      "loss": 0.2997,
+      "step": 1745
+    },
+    {
+      "epoch": 8.5995085995086,
+      "grad_norm": 0.8203125,
+      "learning_rate": 1.136849148460125e-05,
+      "loss": 0.3042,
+      "step": 1750
+    },
+    {
+      "epoch": 8.624078624078624,
+      "grad_norm": 0.88671875,
+      "learning_rate": 1.0973628476189257e-05,
+      "loss": 0.2933,
+      "step": 1755
+    },
+    {
+      "epoch": 8.64864864864865,
+      "grad_norm": 0.8046875,
+      "learning_rate": 1.0585346267246743e-05,
+      "loss": 0.2983,
+      "step": 1760
+    },
+    {
+      "epoch": 8.673218673218674,
+      "grad_norm": 0.7890625,
+      "learning_rate": 1.0203673559464089e-05,
+      "loss": 0.2963,
+      "step": 1765
+    },
+    {
+      "epoch": 8.697788697788697,
+      "grad_norm": 0.83984375,
+      "learning_rate": 9.82863856595968e-06,
+      "loss": 0.2952,
+      "step": 1770
+    },
+    {
+      "epoch": 8.722358722358722,
+      "grad_norm": 0.8359375,
+      "learning_rate": 9.460269009194167e-06,
+      "loss": 0.304,
+      "step": 1775
+    },
+    {
+      "epoch": 8.746928746928747,
+      "grad_norm": 0.84765625,
+      "learning_rate": 9.098592118921435e-06,
+      "loss": 0.3037,
+      "step": 1780
+    },
+    {
+      "epoch": 8.771498771498772,
+      "grad_norm": 0.79296875,
+      "learning_rate": 8.74363463017569e-06,
+      "loss": 0.3058,
+      "step": 1785
+    },
+    {
+      "epoch": 8.796068796068797,
+      "grad_norm": 1.03125,
+      "learning_rate": 8.395422781295192e-06,
+      "loss": 0.3031,
+      "step": 1790
+    },
+    {
+      "epoch": 8.82063882063882,
+      "grad_norm": 0.875,
+      "learning_rate": 8.053982311982867e-06,
+      "loss": 0.3017,
+      "step": 1795
+    },
+    {
+      "epoch": 8.845208845208845,
+      "grad_norm": 0.9453125,
+      "learning_rate": 7.719338461403435e-06,
+      "loss": 0.2978,
+      "step": 1800
+    },
+    {
+      "epoch": 8.86977886977887,
+      "grad_norm": 0.82421875,
+      "learning_rate": 7.3915159663179075e-06,
+      "loss": 0.2905,
+      "step": 1805
+    },
+    {
+      "epoch": 8.894348894348894,
+      "grad_norm": 0.81640625,
+      "learning_rate": 7.070539059254977e-06,
+      "loss": 0.2957,
+      "step": 1810
+    },
+    {
+      "epoch": 8.91891891891892,
+      "grad_norm": 0.7890625,
+      "learning_rate": 6.756431466719737e-06,
+      "loss": 0.2988,
+      "step": 1815
+    },
+    {
+      "epoch": 8.943488943488944,
+      "grad_norm": 0.84375,
+      "learning_rate": 6.4492164074399065e-06,
+      "loss": 0.3019,
+      "step": 1820
+    },
+    {
+      "epoch": 8.968058968058967,
+      "grad_norm": 0.8359375,
+      "learning_rate": 6.148916590649434e-06,
+      "loss": 0.3051,
+      "step": 1825
+    },
+    {
+      "epoch": 8.992628992628992,
+      "grad_norm": 0.859375,
+      "learning_rate": 5.8555542144098865e-06,
+      "loss": 0.3089,
+      "step": 1830
+    },
+    {
+      "epoch": 8.997542997542997,
+      "eval_loss": 4.689078330993652,
+      "eval_runtime": 2.054,
+      "eval_samples_per_second": 4.868,
+      "eval_steps_per_second": 0.974,
+      "step": 1831
+    },
+    {
+      "epoch": 9.017199017199017,
+      "grad_norm": 0.78515625,
+      "learning_rate": 5.569150963969494e-06,
+      "loss": 0.2973,
+      "step": 1835
+    },
+    {
+      "epoch": 9.041769041769042,
+      "grad_norm": 0.80078125,
+      "learning_rate": 5.289728010160366e-06,
+      "loss": 0.3,
+      "step": 1840
+    },
+    {
+      "epoch": 9.066339066339067,
+      "grad_norm": 0.7578125,
+      "learning_rate": 5.0173060078333225e-06,
+      "loss": 0.299,
+      "step": 1845
+    },
+    {
+      "epoch": 9.090909090909092,
+      "grad_norm": 0.796875,
+      "learning_rate": 4.7519050943312325e-06,
+      "loss": 0.2971,
+      "step": 1850
+    },
+    {
+      "epoch": 9.115479115479115,
+      "grad_norm": 0.78125,
+      "learning_rate": 4.493544888000467e-06,
+      "loss": 0.2968,
+      "step": 1855
+    },
+    {
+      "epoch": 9.14004914004914,
+      "grad_norm": 0.765625,
+      "learning_rate": 4.242244486740643e-06,
+      "loss": 0.295,
+      "step": 1860
+    },
+    {
+      "epoch": 9.164619164619165,
+      "grad_norm": 0.8125,
+      "learning_rate": 3.99802246659301e-06,
+      "loss": 0.2911,
+      "step": 1865
+    },
+    {
+      "epoch": 9.18918918918919,
+      "grad_norm": 0.76953125,
+      "learning_rate": 3.760896880367215e-06,
+      "loss": 0.2916,
+      "step": 1870
+    },
+    {
+      "epoch": 9.213759213759214,
+      "grad_norm": 0.7890625,
+      "learning_rate": 3.530885256306915e-06,
+      "loss": 0.2998,
+      "step": 1875
+    },
+    {
+      "epoch": 9.238329238329237,
+      "grad_norm": 0.8046875,
+      "learning_rate": 3.308004596794101e-06,
+      "loss": 0.2948,
+      "step": 1880
+    },
+    {
+      "epoch": 9.262899262899262,
+      "grad_norm": 0.81640625,
+      "learning_rate": 3.092271377092215e-06,
+      "loss": 0.2917,
+      "step": 1885
+    },
+    {
+      "epoch": 9.287469287469287,
+      "grad_norm": 0.75,
+      "learning_rate": 2.8837015441283586e-06,
+      "loss": 0.295,
+      "step": 1890
+    },
+    {
+      "epoch": 9.312039312039312,
+      "grad_norm": 0.74609375,
+      "learning_rate": 2.682310515314512e-06,
+      "loss": 0.2928,
+      "step": 1895
+    },
+    {
+      "epoch": 9.336609336609337,
+      "grad_norm": 0.8046875,
+      "learning_rate": 2.488113177407869e-06,
+      "loss": 0.2909,
+      "step": 1900
+    },
+    {
+      "epoch": 9.361179361179362,
+      "grad_norm": 0.75,
+      "learning_rate": 2.3011238854103947e-06,
+      "loss": 0.2929,
+      "step": 1905
+    },
+    {
+      "epoch": 9.385749385749385,
+      "grad_norm": 0.796875,
+      "learning_rate": 2.1213564615077065e-06,
+      "loss": 0.2936,
+      "step": 1910
+    },
+    {
+      "epoch": 9.41031941031941,
+      "grad_norm": 0.79296875,
+      "learning_rate": 1.9488241940473828e-06,
+      "loss": 0.2941,
+      "step": 1915
+    },
+    {
+      "epoch": 9.434889434889435,
+      "grad_norm": 0.76171875,
+      "learning_rate": 1.783539836556669e-06,
+      "loss": 0.2941,
+      "step": 1920
+    },
+    {
+      "epoch": 9.45945945945946,
+      "grad_norm": 0.78515625,
+      "learning_rate": 1.6255156067997323e-06,
+      "loss": 0.2869,
+      "step": 1925
+    },
+    {
+      "epoch": 9.484029484029485,
+      "grad_norm": 0.80859375,
+      "learning_rate": 1.474763185874517e-06,
+      "loss": 0.3029,
+      "step": 1930
+    },
+    {
+      "epoch": 9.50859950859951,
+      "grad_norm": 0.80859375,
+      "learning_rate": 1.3312937173493577e-06,
+      "loss": 0.2943,
+      "step": 1935
+    },
+    {
+      "epoch": 9.533169533169533,
+      "grad_norm": 0.83984375,
+      "learning_rate": 1.19511780643915e-06,
+      "loss": 0.2963,
+      "step": 1940
+    },
+    {
+      "epoch": 9.557739557739557,
+      "grad_norm": 0.765625,
+      "learning_rate": 1.066245519221465e-06,
+      "loss": 0.3002,
+      "step": 1945
+    },
+    {
+      "epoch": 9.582309582309582,
+      "grad_norm": 0.76171875,
+      "learning_rate": 9.446863818924679e-07,
+      "loss": 0.2988,
+      "step": 1950
+    },
+    {
+      "epoch": 9.606879606879607,
+      "grad_norm": 0.77734375,
+      "learning_rate": 8.304493800627589e-07,
+      "loss": 0.297,
+      "step": 1955
+    },
+    {
+      "epoch": 9.631449631449632,
+      "grad_norm": 0.75,
+      "learning_rate": 7.235429580931152e-07,
+      "loss": 0.2902,
+      "step": 1960
+    },
+    {
+      "epoch": 9.656019656019655,
+      "grad_norm": 0.859375,
+      "learning_rate": 6.239750184703464e-07,
+      "loss": 0.2933,
+      "step": 1965
+    },
+    {
+      "epoch": 9.68058968058968,
+      "grad_norm": 0.76953125,
+      "learning_rate": 5.317529212230721e-07,
+      "loss": 0.2943,
+      "step": 1970
+    },
+    {
+      "epoch": 9.705159705159705,
+      "grad_norm": 0.78125,
+      "learning_rate": 4.4688348337774686e-07,
+      "loss": 0.3049,
+      "step": 1975
+    },
+    {
+      "epoch": 9.72972972972973,
+      "grad_norm": 0.83984375,
+      "learning_rate": 3.693729784546962e-07,
+      "loss": 0.2876,
+      "step": 1980
+    },
+    {
+      "epoch": 9.754299754299755,
+      "grad_norm": 0.78515625,
+      "learning_rate": 2.9922713600439854e-07,
+      "loss": 0.2971,
+      "step": 1985
+    },
+    {
+      "epoch": 9.77886977886978,
+      "grad_norm": 0.765625,
+      "learning_rate": 2.3645114118395762e-07,
+      "loss": 0.288,
+      "step": 1990
+    },
+    {
+      "epoch": 9.803439803439803,
+      "grad_norm": 0.84765625,
+      "learning_rate": 1.8104963437381993e-07,
+      "loss": 0.2951,
+      "step": 1995
+    },
+    {
+      "epoch": 9.828009828009828,
+      "grad_norm": 0.7578125,
+      "learning_rate": 1.3302671083474938e-07,
+      "loss": 0.2927,
+      "step": 2000
+    },
+    {
+      "epoch": 9.852579852579852,
+      "grad_norm": 0.78515625,
+      "learning_rate": 9.238592040512472e-08,
+      "loss": 0.2932,
+      "step": 2005
+    },
+    {
+      "epoch": 9.877149877149877,
+      "grad_norm": 0.75390625,
+      "learning_rate": 5.913026723850523e-08,
+      "loss": 0.2913,
+      "step": 2010
+    },
+    {
+      "epoch": 9.901719901719902,
+      "grad_norm": 0.8046875,
+      "learning_rate": 3.3262209581619297e-08,
+      "loss": 0.2997,
+      "step": 2015
+    },
+    {
+      "epoch": 9.926289926289925,
+      "grad_norm": 0.85546875,
+      "learning_rate": 1.4783659592576548e-08,
+      "loss": 0.2977,
+      "step": 2020
+    },
+    {
+      "epoch": 9.95085995085995,
+      "grad_norm": 0.80078125,
+      "learning_rate": 3.6959831996030704e-09,
+      "loss": 0.2988,
+      "step": 2025
+    },
+    {
+      "epoch": 9.975429975429975,
+      "grad_norm": 0.84375,
+      "learning_rate": 0.0,
+      "loss": 0.3016,
+      "step": 2030
+    },
+    {
+      "epoch": 9.975429975429975,
+      "eval_loss": 4.717045307159424,
+      "eval_runtime": 2.0451,
+      "eval_samples_per_second": 4.89,
+      "eval_steps_per_second": 0.978,
+      "step": 2030
+    },
+    {
+      "epoch": 9.975429975429975,
+      "step": 2030,
+      "total_flos": 1.5518062706111283e+18,
+      "train_loss": 1.3545548074938394,
+      "train_runtime": 13205.4864,
+      "train_samples_per_second": 2.465,
+      "train_steps_per_second": 0.154
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2030,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 100,
+  "total_flos": 1.5518062706111283e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}