End of training

Browse files

Files changed (5) hide show

README.md +14 -2
all_results.json +16 -0
eval_results.json +10 -0
train_results.json +9 -0
trainer_state.json +2399 -0

README.md CHANGED Viewed

@@ -2,11 +2,23 @@
 library_name: transformers
 tags:
 - generated_from_trainer
 metrics:
 - accuracy
 model-index:
 - name: opt-babylm2-clean-spacy-32k_seed-42_1e-3
-  results: []
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -14,7 +26,7 @@ should probably proofread and complete it, then remove this comment. -->
 # opt-babylm2-clean-spacy-32k_seed-42_1e-3
-This model was trained from scratch on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 3.0357
 - Accuracy: 0.4240

 library_name: transformers
 tags:
 - generated_from_trainer
+datasets:
+- kanishka/babylm2-clean-spacy
 metrics:
 - accuracy
 model-index:
 - name: opt-babylm2-clean-spacy-32k_seed-42_1e-3
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: kanishka/babylm2-clean-spacy
+      type: kanishka/babylm2-clean-spacy
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.4239814649263961
 ---
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 # opt-babylm2-clean-spacy-32k_seed-42_1e-3
+This model was trained from scratch on the kanishka/babylm2-clean-spacy dataset.
 It achieves the following results on the evaluation set:
 - Loss: 3.0357
 - Accuracy: 0.4240

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 20.0,
+    "eval_accuracy": 0.4239814649263961,
+    "eval_loss": 3.035736322402954,
+    "eval_runtime": 112.5567,
+    "eval_samples": 52640,
+    "eval_samples_per_second": 467.675,
+    "eval_steps_per_second": 7.312,
+    "perplexity": 20.816299765730864,
+    "total_flos": 1.30265052217344e+18,
+    "train_loss": 2.6546336687942524,
+    "train_runtime": 44210.8645,
+    "train_samples": 498542,
+    "train_samples_per_second": 225.529,
+    "train_steps_per_second": 7.048
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "epoch": 20.0,
+    "eval_accuracy": 0.4239814649263961,
+    "eval_loss": 3.035736322402954,
+    "eval_runtime": 112.5567,
+    "eval_samples": 52640,
+    "eval_samples_per_second": 467.675,
+    "eval_steps_per_second": 7.312,
+    "perplexity": 20.816299765730864
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 20.0,
+    "total_flos": 1.30265052217344e+18,
+    "train_loss": 2.6546336687942524,
+    "train_runtime": 44210.8645,
+    "train_samples": 498542,
+    "train_samples_per_second": 225.529,
+    "train_steps_per_second": 7.048
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2399 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 20.0,
+  "eval_steps": 500,
+  "global_step": 311600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.06418485237483953,
+      "grad_norm": 1.043227195739746,
+      "learning_rate": 3.125e-05,
+      "loss": 6.1267,
+      "step": 1000
+    },
+    {
+      "epoch": 0.12836970474967907,
+      "grad_norm": 1.0838656425476074,
+      "learning_rate": 6.25e-05,
+      "loss": 4.5193,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1925545571245186,
+      "grad_norm": 0.9696599841117859,
+      "learning_rate": 9.375e-05,
+      "loss": 4.1718,
+      "step": 3000
+    },
+    {
+      "epoch": 0.25673940949935814,
+      "grad_norm": 0.8352647423744202,
+      "learning_rate": 0.000125,
+      "loss": 3.9406,
+      "step": 4000
+    },
+    {
+      "epoch": 0.3209242618741977,
+      "grad_norm": 0.7503564953804016,
+      "learning_rate": 0.00015625,
+      "loss": 3.779,
+      "step": 5000
+    },
+    {
+      "epoch": 0.3851091142490372,
+      "grad_norm": 0.7538415193557739,
+      "learning_rate": 0.0001875,
+      "loss": 3.6577,
+      "step": 6000
+    },
+    {
+      "epoch": 0.4492939666238768,
+      "grad_norm": 0.6559199690818787,
+      "learning_rate": 0.00021875,
+      "loss": 3.5747,
+      "step": 7000
+    },
+    {
+      "epoch": 0.5134788189987163,
+      "grad_norm": 0.7036673426628113,
+      "learning_rate": 0.00025,
+      "loss": 3.5069,
+      "step": 8000
+    },
+    {
+      "epoch": 0.5776636713735558,
+      "grad_norm": 0.6067727208137512,
+      "learning_rate": 0.00028125000000000003,
+      "loss": 3.4443,
+      "step": 9000
+    },
+    {
+      "epoch": 0.6418485237483954,
+      "grad_norm": 0.5849200487136841,
+      "learning_rate": 0.0003125,
+      "loss": 3.4028,
+      "step": 10000
+    },
+    {
+      "epoch": 0.7060333761232349,
+      "grad_norm": 0.5407277345657349,
+      "learning_rate": 0.00034371875,
+      "loss": 3.3687,
+      "step": 11000
+    },
+    {
+      "epoch": 0.7702182284980744,
+      "grad_norm": 0.5096879601478577,
+      "learning_rate": 0.00037496875000000003,
+      "loss": 3.3356,
+      "step": 12000
+    },
+    {
+      "epoch": 0.834403080872914,
+      "grad_norm": 0.4796447455883026,
+      "learning_rate": 0.0004061875,
+      "loss": 3.2971,
+      "step": 13000
+    },
+    {
+      "epoch": 0.8985879332477535,
+      "grad_norm": 0.4423964321613312,
+      "learning_rate": 0.00043740625,
+      "loss": 3.2819,
+      "step": 14000
+    },
+    {
+      "epoch": 0.962772785622593,
+      "grad_norm": 0.3888336420059204,
+      "learning_rate": 0.00046865625,
+      "loss": 3.2533,
+      "step": 15000
+    },
+    {
+      "epoch": 1.0,
+      "eval_accuracy": 0.3827605191012575,
+      "eval_loss": 3.314154863357544,
+      "eval_runtime": 112.692,
+      "eval_samples_per_second": 467.114,
+      "eval_steps_per_second": 7.303,
+      "step": 15580
+    },
+    {
+      "epoch": 1.0269576379974326,
+      "grad_norm": 0.4023679196834564,
+      "learning_rate": 0.00049990625,
+      "loss": 3.2279,
+      "step": 16000
+    },
+    {
+      "epoch": 1.0911424903722722,
+      "grad_norm": 0.3901638686656952,
+      "learning_rate": 0.00053115625,
+      "loss": 3.1885,
+      "step": 17000
+    },
+    {
+      "epoch": 1.1553273427471118,
+      "grad_norm": 0.35213154554367065,
+      "learning_rate": 0.00056240625,
+      "loss": 3.1852,
+      "step": 18000
+    },
+    {
+      "epoch": 1.2195121951219512,
+      "grad_norm": 0.3163226246833801,
+      "learning_rate": 0.00059359375,
+      "loss": 3.1754,
+      "step": 19000
+    },
+    {
+      "epoch": 1.2836970474967908,
+      "grad_norm": 0.3097131848335266,
+      "learning_rate": 0.0006248437500000001,
+      "loss": 3.1646,
+      "step": 20000
+    },
+    {
+      "epoch": 1.3478818998716302,
+      "grad_norm": 0.30222874879837036,
+      "learning_rate": 0.00065609375,
+      "loss": 3.1492,
+      "step": 21000
+    },
+    {
+      "epoch": 1.4120667522464698,
+      "grad_norm": 0.27634382247924805,
+      "learning_rate": 0.0006873125,
+      "loss": 3.1411,
+      "step": 22000
+    },
+    {
+      "epoch": 1.4762516046213094,
+      "grad_norm": 0.27012380957603455,
+      "learning_rate": 0.0007185625,
+      "loss": 3.1381,
+      "step": 23000
+    },
+    {
+      "epoch": 1.540436456996149,
+      "grad_norm": 0.2659747004508972,
+      "learning_rate": 0.00074978125,
+      "loss": 3.1202,
+      "step": 24000
+    },
+    {
+      "epoch": 1.6046213093709885,
+      "grad_norm": 0.24373669922351837,
+      "learning_rate": 0.0007810312499999999,
+      "loss": 3.117,
+      "step": 25000
+    },
+    {
+      "epoch": 1.6688061617458279,
+      "grad_norm": 0.23827452957630157,
+      "learning_rate": 0.00081228125,
+      "loss": 3.104,
+      "step": 26000
+    },
+    {
+      "epoch": 1.7329910141206675,
+      "grad_norm": 0.2538315951824188,
+      "learning_rate": 0.00084353125,
+      "loss": 3.0942,
+      "step": 27000
+    },
+    {
+      "epoch": 1.797175866495507,
+      "grad_norm": 0.21684885025024414,
+      "learning_rate": 0.00087478125,
+      "loss": 3.0855,
+      "step": 28000
+    },
+    {
+      "epoch": 1.8613607188703467,
+      "grad_norm": 0.2181146740913391,
+      "learning_rate": 0.000906,
+      "loss": 3.0828,
+      "step": 29000
+    },
+    {
+      "epoch": 1.925545571245186,
+      "grad_norm": 0.19399482011795044,
+      "learning_rate": 0.00093725,
+      "loss": 3.0704,
+      "step": 30000
+    },
+    {
+      "epoch": 1.9897304236200257,
+      "grad_norm": 0.21062089502811432,
+      "learning_rate": 0.00096846875,
+      "loss": 3.064,
+      "step": 31000
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.4002464390011324,
+      "eval_loss": 3.147815465927124,
+      "eval_runtime": 113.1564,
+      "eval_samples_per_second": 465.197,
+      "eval_steps_per_second": 7.273,
+      "step": 31160
+    },
+    {
+      "epoch": 2.053915275994865,
+      "grad_norm": 0.18824858963489532,
+      "learning_rate": 0.00099971875,
+      "loss": 3.014,
+      "step": 32000
+    },
+    {
+      "epoch": 2.1181001283697047,
+      "grad_norm": 0.1937263160943985,
+      "learning_rate": 0.0009964556509298999,
+      "loss": 3.003,
+      "step": 33000
+    },
+    {
+      "epoch": 2.1822849807445444,
+      "grad_norm": 0.18498077988624573,
+      "learning_rate": 0.0009928826895565094,
+      "loss": 3.0099,
+      "step": 34000
+    },
+    {
+      "epoch": 2.246469833119384,
+      "grad_norm": 0.17105349898338318,
+      "learning_rate": 0.0009893061516452074,
+      "loss": 2.9979,
+      "step": 35000
+    },
+    {
+      "epoch": 2.3106546854942236,
+      "grad_norm": 0.17657169699668884,
+      "learning_rate": 0.0009857331902718169,
+      "loss": 2.994,
+      "step": 36000
+    },
+    {
+      "epoch": 2.3748395378690628,
+      "grad_norm": 0.2064744532108307,
+      "learning_rate": 0.000982156652360515,
+      "loss": 2.9866,
+      "step": 37000
+    },
+    {
+      "epoch": 2.4390243902439024,
+      "grad_norm": 0.1920684576034546,
+      "learning_rate": 0.000978580114449213,
+      "loss": 2.9776,
+      "step": 38000
+    },
+    {
+      "epoch": 2.503209242618742,
+      "grad_norm": 0.16097630560398102,
+      "learning_rate": 0.0009750035765379113,
+      "loss": 2.9774,
+      "step": 39000
+    },
+    {
+      "epoch": 2.5673940949935816,
+      "grad_norm": 0.18222209811210632,
+      "learning_rate": 0.0009714306151645208,
+      "loss": 2.9705,
+      "step": 40000
+    },
+    {
+      "epoch": 2.6315789473684212,
+      "grad_norm": 0.17541208863258362,
+      "learning_rate": 0.000967854077253219,
+      "loss": 2.9642,
+      "step": 41000
+    },
+    {
+      "epoch": 2.6957637997432604,
+      "grad_norm": 0.15682058036327362,
+      "learning_rate": 0.0009642775393419171,
+      "loss": 2.9563,
+      "step": 42000
+    },
+    {
+      "epoch": 2.7599486521181,
+      "grad_norm": 0.1775752454996109,
+      "learning_rate": 0.0009607045779685265,
+      "loss": 2.9493,
+      "step": 43000
+    },
+    {
+      "epoch": 2.8241335044929397,
+      "grad_norm": 0.19568467140197754,
+      "learning_rate": 0.0009571280400572247,
+      "loss": 2.9404,
+      "step": 44000
+    },
+    {
+      "epoch": 2.8883183568677793,
+      "grad_norm": 0.1565084159374237,
+      "learning_rate": 0.0009535550786838341,
+      "loss": 2.9451,
+      "step": 45000
+    },
+    {
+      "epoch": 2.952503209242619,
+      "grad_norm": 0.1869024932384491,
+      "learning_rate": 0.0009499785407725322,
+      "loss": 2.9372,
+      "step": 46000
+    },
+    {
+      "epoch": 3.0,
+      "eval_accuracy": 0.41228246617796055,
+      "eval_loss": 3.0373053550720215,
+      "eval_runtime": 113.4953,
+      "eval_samples_per_second": 463.808,
+      "eval_steps_per_second": 7.251,
+      "step": 46740
+    },
+    {
+      "epoch": 3.016688061617458,
+      "grad_norm": 0.16253632307052612,
+      "learning_rate": 0.0009464055793991416,
+      "loss": 2.9111,
+      "step": 47000
+    },
+    {
+      "epoch": 3.0808729139922977,
+      "grad_norm": 0.15888482332229614,
+      "learning_rate": 0.0009428290414878398,
+      "loss": 2.8581,
+      "step": 48000
+    },
+    {
+      "epoch": 3.1450577663671373,
+      "grad_norm": 0.1883237361907959,
+      "learning_rate": 0.0009392525035765379,
+      "loss": 2.8599,
+      "step": 49000
+    },
+    {
+      "epoch": 3.209242618741977,
+      "grad_norm": 0.1678224354982376,
+      "learning_rate": 0.0009356831187410587,
+      "loss": 2.8651,
+      "step": 50000
+    },
+    {
+      "epoch": 3.2734274711168165,
+      "grad_norm": 0.1709676831960678,
+      "learning_rate": 0.0009321065808297568,
+      "loss": 2.8641,
+      "step": 51000
+    },
+    {
+      "epoch": 3.337612323491656,
+      "grad_norm": 0.1678624153137207,
+      "learning_rate": 0.000928530042918455,
+      "loss": 2.8589,
+      "step": 52000
+    },
+    {
+      "epoch": 3.4017971758664953,
+      "grad_norm": 0.19356407225131989,
+      "learning_rate": 0.0009249535050071531,
+      "loss": 2.866,
+      "step": 53000
+    },
+    {
+      "epoch": 3.465982028241335,
+      "grad_norm": null,
+      "learning_rate": 0.0009213805436337625,
+      "loss": 2.8603,
+      "step": 54000
+    },
+    {
+      "epoch": 3.5301668806161746,
+      "grad_norm": 0.19353008270263672,
+      "learning_rate": 0.0009178040057224607,
+      "loss": 2.8639,
+      "step": 55000
+    },
+    {
+      "epoch": 3.594351732991014,
+      "grad_norm": 0.16915424168109894,
+      "learning_rate": 0.0009142274678111588,
+      "loss": 2.8607,
+      "step": 56000
+    },
+    {
+      "epoch": 3.658536585365854,
+      "grad_norm": 0.16573752462863922,
+      "learning_rate": 0.0009106545064377682,
+      "loss": 2.8589,
+      "step": 57000
+    },
+    {
+      "epoch": 3.7227214377406934,
+      "grad_norm": 0.1988651305437088,
+      "learning_rate": 0.0009070779685264664,
+      "loss": 2.8545,
+      "step": 58000
+    },
+    {
+      "epoch": 3.7869062901155326,
+      "grad_norm": 0.16490334272384644,
+      "learning_rate": 0.0009035050071530759,
+      "loss": 2.8589,
+      "step": 59000
+    },
+    {
+      "epoch": 3.851091142490372,
+      "grad_norm": 0.1708660125732422,
+      "learning_rate": 0.0008999284692417739,
+      "loss": 2.8569,
+      "step": 60000
+    },
+    {
+      "epoch": 3.915275994865212,
+      "grad_norm": 0.20747113227844238,
+      "learning_rate": 0.0008963555078683834,
+      "loss": 2.8523,
+      "step": 61000
+    },
+    {
+      "epoch": 3.9794608472400514,
+      "grad_norm": 0.1617908626794815,
+      "learning_rate": 0.0008927789699570816,
+      "loss": 2.8458,
+      "step": 62000
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.41800278622087134,
+      "eval_loss": 2.9891786575317383,
+      "eval_runtime": 113.4111,
+      "eval_samples_per_second": 464.152,
+      "eval_steps_per_second": 7.257,
+      "step": 62320
+    },
+    {
+      "epoch": 4.043645699614891,
+      "grad_norm": 0.20924492180347443,
+      "learning_rate": 0.0008892060085836911,
+      "loss": 2.7891,
+      "step": 63000
+    },
+    {
+      "epoch": 4.10783055198973,
+      "grad_norm": 0.18781903386116028,
+      "learning_rate": 0.0008856294706723892,
+      "loss": 2.7725,
+      "step": 64000
+    },
+    {
+      "epoch": 4.17201540436457,
+      "grad_norm": 0.23733529448509216,
+      "learning_rate": 0.0008820529327610874,
+      "loss": 2.778,
+      "step": 65000
+    },
+    {
+      "epoch": 4.2362002567394095,
+      "grad_norm": 0.19477002322673798,
+      "learning_rate": 0.0008784763948497855,
+      "loss": 2.7777,
+      "step": 66000
+    },
+    {
+      "epoch": 4.300385109114249,
+      "grad_norm": 0.17136947810649872,
+      "learning_rate": 0.0008749034334763949,
+      "loss": 2.7804,
+      "step": 67000
+    },
+    {
+      "epoch": 4.364569961489089,
+      "grad_norm": 0.25313475728034973,
+      "learning_rate": 0.0008713268955650931,
+      "loss": 2.7886,
+      "step": 68000
+    },
+    {
+      "epoch": 4.428754813863928,
+      "grad_norm": 0.1842871755361557,
+      "learning_rate": 0.0008677503576537912,
+      "loss": 2.7803,
+      "step": 69000
+    },
+    {
+      "epoch": 4.492939666238768,
+      "grad_norm": 0.2282925546169281,
+      "learning_rate": 0.0008641773962804006,
+      "loss": 2.7881,
+      "step": 70000
+    },
+    {
+      "epoch": 4.557124518613607,
+      "grad_norm": 0.17763297259807587,
+      "learning_rate": 0.0008606008583690988,
+      "loss": 2.7897,
+      "step": 71000
+    },
+    {
+      "epoch": 4.621309370988447,
+      "grad_norm": 0.17076101899147034,
+      "learning_rate": 0.0008570278969957082,
+      "loss": 2.7892,
+      "step": 72000
+    },
+    {
+      "epoch": 4.685494223363286,
+      "grad_norm": 0.20617830753326416,
+      "learning_rate": 0.0008534513590844063,
+      "loss": 2.7934,
+      "step": 73000
+    },
+    {
+      "epoch": 4.7496790757381255,
+      "grad_norm": 0.17589814960956573,
+      "learning_rate": 0.0008498783977110157,
+      "loss": 2.7888,
+      "step": 74000
+    },
+    {
+      "epoch": 4.813863928112966,
+      "grad_norm": 0.17282748222351074,
+      "learning_rate": 0.0008463018597997139,
+      "loss": 2.7928,
+      "step": 75000
+    },
+    {
+      "epoch": 4.878048780487805,
+      "grad_norm": 0.1879618763923645,
+      "learning_rate": 0.000842725321888412,
+      "loss": 2.7842,
+      "step": 76000
+    },
+    {
+      "epoch": 4.942233632862644,
+      "grad_norm": 0.27251970767974854,
+      "learning_rate": 0.0008391523605150215,
+      "loss": 2.7931,
+      "step": 77000
+    },
+    {
+      "epoch": 5.0,
+      "eval_accuracy": 0.4212023958519578,
+      "eval_loss": 2.9617300033569336,
+      "eval_runtime": 113.1396,
+      "eval_samples_per_second": 465.266,
+      "eval_steps_per_second": 7.274,
+      "step": 77900
+    },
+    {
+      "epoch": 5.006418485237484,
+      "grad_norm": 0.17574380338191986,
+      "learning_rate": 0.0008355793991416309,
+      "loss": 2.7806,
+      "step": 78000
+    },
+    {
+      "epoch": 5.070603337612323,
+      "grad_norm": 0.1976320743560791,
+      "learning_rate": 0.0008320028612303291,
+      "loss": 2.6979,
+      "step": 79000
+    },
+    {
+      "epoch": 5.134788189987163,
+      "grad_norm": 0.17811007797718048,
+      "learning_rate": 0.0008284263233190272,
+      "loss": 2.7114,
+      "step": 80000
+    },
+    {
+      "epoch": 5.198973042362002,
+      "grad_norm": 0.19787943363189697,
+      "learning_rate": 0.0008248497854077254,
+      "loss": 2.7157,
+      "step": 81000
+    },
+    {
+      "epoch": 5.2631578947368425,
+      "grad_norm": 0.231247678399086,
+      "learning_rate": 0.0008212732474964235,
+      "loss": 2.7172,
+      "step": 82000
+    },
+    {
+      "epoch": 5.327342747111682,
+      "grad_norm": 0.17853890359401703,
+      "learning_rate": 0.0008177002861230329,
+      "loss": 2.7299,
+      "step": 83000
+    },
+    {
+      "epoch": 5.391527599486521,
+      "grad_norm": 0.18848580121994019,
+      "learning_rate": 0.0008141273247496423,
+      "loss": 2.7296,
+      "step": 84000
+    },
+    {
+      "epoch": 5.455712451861361,
+      "grad_norm": 0.1779264509677887,
+      "learning_rate": 0.0008105507868383405,
+      "loss": 2.7282,
+      "step": 85000
+    },
+    {
+      "epoch": 5.5198973042362,
+      "grad_norm": 0.2169097363948822,
+      "learning_rate": 0.0008069742489270386,
+      "loss": 2.7305,
+      "step": 86000
+    },
+    {
+      "epoch": 5.58408215661104,
+      "grad_norm": 0.1910111904144287,
+      "learning_rate": 0.0008033977110157368,
+      "loss": 2.7319,
+      "step": 87000
+    },
+    {
+      "epoch": 5.648267008985879,
+      "grad_norm": 0.1949514001607895,
+      "learning_rate": 0.0007998247496423462,
+      "loss": 2.7392,
+      "step": 88000
+    },
+    {
+      "epoch": 5.712451861360719,
+      "grad_norm": 0.18040545284748077,
+      "learning_rate": 0.0007962482117310443,
+      "loss": 2.7338,
+      "step": 89000
+    },
+    {
+      "epoch": 5.7766367137355585,
+      "grad_norm": 0.2376387119293213,
+      "learning_rate": 0.0007926716738197425,
+      "loss": 2.7409,
+      "step": 90000
+    },
+    {
+      "epoch": 5.840821566110398,
+      "grad_norm": 0.23671500384807587,
+      "learning_rate": 0.0007890951359084406,
+      "loss": 2.7415,
+      "step": 91000
+    },
+    {
+      "epoch": 5.905006418485238,
+      "grad_norm": 0.20994797348976135,
+      "learning_rate": 0.00078552217453505,
+      "loss": 2.7396,
+      "step": 92000
+    },
+    {
+      "epoch": 5.969191270860077,
+      "grad_norm": 0.18293611705303192,
+      "learning_rate": 0.0007819456366237482,
+      "loss": 2.7418,
+      "step": 93000
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.42353231718219203,
+      "eval_loss": 2.9472451210021973,
+      "eval_runtime": 112.282,
+      "eval_samples_per_second": 468.82,
+      "eval_steps_per_second": 7.33,
+      "step": 93480
+    },
+    {
+      "epoch": 6.033376123234916,
+      "grad_norm": 0.23322834074497223,
+      "learning_rate": 0.0007783726752503577,
+      "loss": 2.6918,
+      "step": 94000
+    },
+    {
+      "epoch": 6.097560975609756,
+      "grad_norm": 0.2009533792734146,
+      "learning_rate": 0.0007747997138769671,
+      "loss": 2.6555,
+      "step": 95000
+    },
+    {
+      "epoch": 6.161745827984595,
+      "grad_norm": 0.2933996319770813,
+      "learning_rate": 0.0007712231759656652,
+      "loss": 2.6563,
+      "step": 96000
+    },
+    {
+      "epoch": 6.225930680359435,
+      "grad_norm": 0.20901496708393097,
+      "learning_rate": 0.0007676466380543634,
+      "loss": 2.6716,
+      "step": 97000
+    },
+    {
+      "epoch": 6.290115532734275,
+      "grad_norm": 0.19593417644500732,
+      "learning_rate": 0.0007640701001430615,
+      "loss": 2.6735,
+      "step": 98000
+    },
+    {
+      "epoch": 6.354300385109115,
+      "grad_norm": 0.22803381085395813,
+      "learning_rate": 0.0007604935622317597,
+      "loss": 2.6748,
+      "step": 99000
+    },
+    {
+      "epoch": 6.418485237483954,
+      "grad_norm": 0.22151826322078705,
+      "learning_rate": 0.0007569206008583691,
+      "loss": 2.6874,
+      "step": 100000
+    },
+    {
+      "epoch": 6.482670089858793,
+      "grad_norm": 0.21168138086795807,
+      "learning_rate": 0.0007533476394849786,
+      "loss": 2.6896,
+      "step": 101000
+    },
+    {
+      "epoch": 6.546854942233633,
+      "grad_norm": 0.20149804651737213,
+      "learning_rate": 0.0007497711015736766,
+      "loss": 2.6884,
+      "step": 102000
+    },
+    {
+      "epoch": 6.611039794608472,
+      "grad_norm": 0.1844107061624527,
+      "learning_rate": 0.0007461945636623749,
+      "loss": 2.6858,
+      "step": 103000
+    },
+    {
+      "epoch": 6.675224646983312,
+      "grad_norm": 0.19826510548591614,
+      "learning_rate": 0.0007426180257510729,
+      "loss": 2.6913,
+      "step": 104000
+    },
+    {
+      "epoch": 6.7394094993581515,
+      "grad_norm": 0.19884894788265228,
+      "learning_rate": 0.0007390414878397711,
+      "loss": 2.6914,
+      "step": 105000
+    },
+    {
+      "epoch": 6.803594351732991,
+      "grad_norm": 0.18995630741119385,
+      "learning_rate": 0.0007354649499284692,
+      "loss": 2.6991,
+      "step": 106000
+    },
+    {
+      "epoch": 6.867779204107831,
+      "grad_norm": 0.18595723807811737,
+      "learning_rate": 0.0007318919885550786,
+      "loss": 2.6989,
+      "step": 107000
+    },
+    {
+      "epoch": 6.93196405648267,
+      "grad_norm": 0.22208259999752045,
+      "learning_rate": 0.0007283154506437768,
+      "loss": 2.6995,
+      "step": 108000
+    },
+    {
+      "epoch": 6.99614890885751,
+      "grad_norm": 0.2557336390018463,
+      "learning_rate": 0.0007247424892703864,
+      "loss": 2.7019,
+      "step": 109000
+    },
+    {
+      "epoch": 7.0,
+      "eval_accuracy": 0.4251007956374039,
+      "eval_loss": 2.938654661178589,
+      "eval_runtime": 113.1288,
+      "eval_samples_per_second": 465.31,
+      "eval_steps_per_second": 7.275,
+      "step": 109060
+    },
+    {
+      "epoch": 7.060333761232349,
+      "grad_norm": 0.2100907266139984,
+      "learning_rate": 0.0007211659513590845,
+      "loss": 2.6144,
+      "step": 110000
+    },
+    {
+      "epoch": 7.124518613607188,
+      "grad_norm": 0.20888903737068176,
+      "learning_rate": 0.0007175929899856939,
+      "loss": 2.6195,
+      "step": 111000
+    },
+    {
+      "epoch": 7.188703465982028,
+      "grad_norm": 0.2282038778066635,
+      "learning_rate": 0.0007140164520743921,
+      "loss": 2.6274,
+      "step": 112000
+    },
+    {
+      "epoch": 7.2528883183568675,
+      "grad_norm": 0.20698107779026031,
+      "learning_rate": 0.0007104434907010015,
+      "loss": 2.6321,
+      "step": 113000
+    },
+    {
+      "epoch": 7.317073170731708,
+      "grad_norm": 0.2202378660440445,
+      "learning_rate": 0.0007068669527896996,
+      "loss": 2.6296,
+      "step": 114000
+    },
+    {
+      "epoch": 7.381258023106547,
+      "grad_norm": 0.30359992384910583,
+      "learning_rate": 0.0007032904148783978,
+      "loss": 2.6413,
+      "step": 115000
+    },
+    {
+      "epoch": 7.445442875481387,
+      "grad_norm": 0.18242676556110382,
+      "learning_rate": 0.0006997210300429184,
+      "loss": 2.6454,
+      "step": 116000
+    },
+    {
+      "epoch": 7.509627727856226,
+      "grad_norm": 0.28255388140678406,
+      "learning_rate": 0.0006961444921316166,
+      "loss": 2.6507,
+      "step": 117000
+    },
+    {
+      "epoch": 7.573812580231065,
+      "grad_norm": 0.21654346585273743,
+      "learning_rate": 0.0006925679542203147,
+      "loss": 2.6457,
+      "step": 118000
+    },
+    {
+      "epoch": 7.637997432605905,
+      "grad_norm": 0.21898731589317322,
+      "learning_rate": 0.0006889914163090129,
+      "loss": 2.6465,
+      "step": 119000
+    },
+    {
+      "epoch": 7.702182284980744,
+      "grad_norm": 0.3429543972015381,
+      "learning_rate": 0.000685414878397711,
+      "loss": 2.6531,
+      "step": 120000
+    },
+    {
+      "epoch": 7.766367137355584,
+      "grad_norm": 0.20613309741020203,
+      "learning_rate": 0.0006818454935622318,
+      "loss": 2.6509,
+      "step": 121000
+    },
+    {
+      "epoch": 7.830551989730424,
+      "grad_norm": 0.2630804777145386,
+      "learning_rate": 0.0006782689556509299,
+      "loss": 2.6599,
+      "step": 122000
+    },
+    {
+      "epoch": 7.894736842105263,
+      "grad_norm": 0.22952473163604736,
+      "learning_rate": 0.0006746924177396281,
+      "loss": 2.6688,
+      "step": 123000
+    },
+    {
+      "epoch": 7.958921694480103,
+      "grad_norm": 0.21582633256912231,
+      "learning_rate": 0.0006711158798283262,
+      "loss": 2.6555,
+      "step": 124000
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.42585136182132427,
+      "eval_loss": 2.938795566558838,
+      "eval_runtime": 112.5269,
+      "eval_samples_per_second": 467.799,
+      "eval_steps_per_second": 7.314,
+      "step": 124640
+    },
+    {
+      "epoch": 8.023106546854942,
+      "grad_norm": 0.21312978863716125,
+      "learning_rate": 0.0006675429184549356,
+      "loss": 2.6305,
+      "step": 125000
+    },
+    {
+      "epoch": 8.087291399229782,
+      "grad_norm": 0.20984013378620148,
+      "learning_rate": 0.0006639663805436338,
+      "loss": 2.579,
+      "step": 126000
+    },
+    {
+      "epoch": 8.15147625160462,
+      "grad_norm": 0.20877642929553986,
+      "learning_rate": 0.0006603934191702432,
+      "loss": 2.581,
+      "step": 127000
+    },
+    {
+      "epoch": 8.21566110397946,
+      "grad_norm": 0.22023415565490723,
+      "learning_rate": 0.0006568168812589413,
+      "loss": 2.5914,
+      "step": 128000
+    },
+    {
+      "epoch": 8.2798459563543,
+      "grad_norm": 0.36641648411750793,
+      "learning_rate": 0.0006532439198855507,
+      "loss": 2.6003,
+      "step": 129000
+    },
+    {
+      "epoch": 8.34403080872914,
+      "grad_norm": 0.21248340606689453,
+      "learning_rate": 0.0006496673819742489,
+      "loss": 2.6038,
+      "step": 130000
+    },
+    {
+      "epoch": 8.408215661103979,
+      "grad_norm": 0.2076880782842636,
+      "learning_rate": 0.000646090844062947,
+      "loss": 2.6018,
+      "step": 131000
+    },
+    {
+      "epoch": 8.472400513478819,
+      "grad_norm": 0.22432173788547516,
+      "learning_rate": 0.0006425143061516452,
+      "loss": 2.6112,
+      "step": 132000
+    },
+    {
+      "epoch": 8.536585365853659,
+      "grad_norm": 0.21105310320854187,
+      "learning_rate": 0.0006389377682403433,
+      "loss": 2.6137,
+      "step": 133000
+    },
+    {
+      "epoch": 8.600770218228497,
+      "grad_norm": 0.1929914504289627,
+      "learning_rate": 0.0006353683834048642,
+      "loss": 2.6134,
+      "step": 134000
+    },
+    {
+      "epoch": 8.664955070603337,
+      "grad_norm": 0.2183075249195099,
+      "learning_rate": 0.0006317918454935622,
+      "loss": 2.62,
+      "step": 135000
+    },
+    {
+      "epoch": 8.729139922978177,
+      "grad_norm": 0.2047470211982727,
+      "learning_rate": 0.0006282153075822604,
+      "loss": 2.619,
+      "step": 136000
+    },
+    {
+      "epoch": 8.793324775353017,
+      "grad_norm": 0.1994733363389969,
+      "learning_rate": 0.0006246387696709585,
+      "loss": 2.6245,
+      "step": 137000
+    },
+    {
+      "epoch": 8.857509627727856,
+      "grad_norm": 0.28145477175712585,
+      "learning_rate": 0.000621065808297568,
+      "loss": 2.6199,
+      "step": 138000
+    },
+    {
+      "epoch": 8.921694480102696,
+      "grad_norm": 0.20502200722694397,
+      "learning_rate": 0.0006174928469241774,
+      "loss": 2.624,
+      "step": 139000
+    },
+    {
+      "epoch": 8.985879332477536,
+      "grad_norm": 0.21982279419898987,
+      "learning_rate": 0.0006139163090128756,
+      "loss": 2.6321,
+      "step": 140000
+    },
+    {
+      "epoch": 9.0,
+      "eval_accuracy": 0.4262641545980094,
+      "eval_loss": 2.9394068717956543,
+      "eval_runtime": 112.5515,
+      "eval_samples_per_second": 467.697,
+      "eval_steps_per_second": 7.312,
+      "step": 140220
+    },
+    {
+      "epoch": 9.050064184852374,
+      "grad_norm": 0.24569900333881378,
+      "learning_rate": 0.000610343347639485,
+      "loss": 2.5553,
+      "step": 141000
+    },
+    {
+      "epoch": 9.114249037227214,
+      "grad_norm": 0.22856919467449188,
+      "learning_rate": 0.0006067668097281831,
+      "loss": 2.5497,
+      "step": 142000
+    },
+    {
+      "epoch": 9.178433889602054,
+      "grad_norm": 0.23001812398433685,
+      "learning_rate": 0.0006031902718168813,
+      "loss": 2.5568,
+      "step": 143000
+    },
+    {
+      "epoch": 9.242618741976893,
+      "grad_norm": 0.2656523287296295,
+      "learning_rate": 0.0005996137339055794,
+      "loss": 2.5636,
+      "step": 144000
+    },
+    {
+      "epoch": 9.306803594351733,
+      "grad_norm": 0.23091475665569305,
+      "learning_rate": 0.0005960371959942776,
+      "loss": 2.568,
+      "step": 145000
+    },
+    {
+      "epoch": 9.370988446726573,
+      "grad_norm": 0.26248905062675476,
+      "learning_rate": 0.0005924678111587983,
+      "loss": 2.5739,
+      "step": 146000
+    },
+    {
+      "epoch": 9.435173299101413,
+      "grad_norm": 0.21125547587871552,
+      "learning_rate": 0.0005888912732474965,
+      "loss": 2.5724,
+      "step": 147000
+    },
+    {
+      "epoch": 9.499358151476251,
+      "grad_norm": 0.22728748619556427,
+      "learning_rate": 0.0005853147353361946,
+      "loss": 2.5749,
+      "step": 148000
+    },
+    {
+      "epoch": 9.563543003851091,
+      "grad_norm": 0.22006183862686157,
+      "learning_rate": 0.0005817381974248928,
+      "loss": 2.5791,
+      "step": 149000
+    },
+    {
+      "epoch": 9.627727856225931,
+      "grad_norm": 0.3410221338272095,
+      "learning_rate": 0.0005781652360515022,
+      "loss": 2.5892,
+      "step": 150000
+    },
+    {
+      "epoch": 9.69191270860077,
+      "grad_norm": 0.22049233317375183,
+      "learning_rate": 0.0005745886981402003,
+      "loss": 2.588,
+      "step": 151000
+    },
+    {
+      "epoch": 9.75609756097561,
+      "grad_norm": 0.22515858709812164,
+      "learning_rate": 0.0005710121602288985,
+      "loss": 2.5873,
+      "step": 152000
+    },
+    {
+      "epoch": 9.82028241335045,
+      "grad_norm": 0.20887359976768494,
+      "learning_rate": 0.0005674391988555079,
+      "loss": 2.5967,
+      "step": 153000
+    },
+    {
+      "epoch": 9.88446726572529,
+      "grad_norm": 0.2514614760875702,
+      "learning_rate": 0.0005638662374821173,
+      "loss": 2.5889,
+      "step": 154000
+    },
+    {
+      "epoch": 9.948652118100128,
+      "grad_norm": 0.28176066279411316,
+      "learning_rate": 0.0005602896995708154,
+      "loss": 2.5968,
+      "step": 155000
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.4265262381548364,
+      "eval_loss": 2.9411489963531494,
+      "eval_runtime": 112.5273,
+      "eval_samples_per_second": 467.797,
+      "eval_steps_per_second": 7.314,
+      "step": 155800
+    },
+    {
+      "epoch": 10.012836970474968,
+      "grad_norm": 0.2588973641395569,
+      "learning_rate": 0.0005567131616595136,
+      "loss": 2.577,
+      "step": 156000
+    },
+    {
+      "epoch": 10.077021822849808,
+      "grad_norm": 0.23765428364276886,
+      "learning_rate": 0.0005531366237482117,
+      "loss": 2.5107,
+      "step": 157000
+    },
+    {
+      "epoch": 10.141206675224646,
+      "grad_norm": 0.4186215102672577,
+      "learning_rate": 0.0005495636623748211,
+      "loss": 2.5149,
+      "step": 158000
+    },
+    {
+      "epoch": 10.205391527599486,
+      "grad_norm": 0.2386707216501236,
+      "learning_rate": 0.0005459871244635193,
+      "loss": 2.5323,
+      "step": 159000
+    },
+    {
+      "epoch": 10.269576379974326,
+      "grad_norm": 0.24114102125167847,
+      "learning_rate": 0.0005424141630901288,
+      "loss": 2.5318,
+      "step": 160000
+    },
+    {
+      "epoch": 10.333761232349165,
+      "grad_norm": 0.2539213299751282,
+      "learning_rate": 0.0005388376251788269,
+      "loss": 2.5398,
+      "step": 161000
+    },
+    {
+      "epoch": 10.397946084724005,
+      "grad_norm": 0.22287596762180328,
+      "learning_rate": 0.0005352646638054363,
+      "loss": 2.5398,
+      "step": 162000
+    },
+    {
+      "epoch": 10.462130937098845,
+      "grad_norm": 0.2541950047016144,
+      "learning_rate": 0.0005316917024320458,
+      "loss": 2.5513,
+      "step": 163000
+    },
+    {
+      "epoch": 10.526315789473685,
+      "grad_norm": 0.24435748159885406,
+      "learning_rate": 0.0005281151645207439,
+      "loss": 2.5499,
+      "step": 164000
+    },
+    {
+      "epoch": 10.590500641848523,
+      "grad_norm": 0.2417035698890686,
+      "learning_rate": 0.000524538626609442,
+      "loss": 2.5547,
+      "step": 165000
+    },
+    {
+      "epoch": 10.654685494223363,
+      "grad_norm": 0.22597502171993256,
+      "learning_rate": 0.0005209620886981402,
+      "loss": 2.5551,
+      "step": 166000
+    },
+    {
+      "epoch": 10.718870346598203,
+      "grad_norm": 0.24076290428638458,
+      "learning_rate": 0.0005173855507868383,
+      "loss": 2.5559,
+      "step": 167000
+    },
+    {
+      "epoch": 10.783055198973042,
+      "grad_norm": 0.23913373053073883,
+      "learning_rate": 0.0005138125894134477,
+      "loss": 2.561,
+      "step": 168000
+    },
+    {
+      "epoch": 10.847240051347882,
+      "grad_norm": 0.22141295671463013,
+      "learning_rate": 0.0005102360515021459,
+      "loss": 2.5657,
+      "step": 169000
+    },
+    {
+      "epoch": 10.911424903722722,
+      "grad_norm": 0.23373030126094818,
+      "learning_rate": 0.0005066630901287553,
+      "loss": 2.5634,
+      "step": 170000
+    },
+    {
+      "epoch": 10.975609756097562,
+      "grad_norm": 0.30720722675323486,
+      "learning_rate": 0.0005030865522174534,
+      "loss": 2.5705,
+      "step": 171000
+    },
+    {
+      "epoch": 11.0,
+      "eval_accuracy": 0.4267387806186304,
+      "eval_loss": 2.9455363750457764,
+      "eval_runtime": 112.5954,
+      "eval_samples_per_second": 467.515,
+      "eval_steps_per_second": 7.309,
+      "step": 171380
+    },
+    {
+      "epoch": 11.0397946084724,
+      "grad_norm": 0.28773027658462524,
+      "learning_rate": 0.000499513590844063,
+      "loss": 2.5192,
+      "step": 172000
+    },
+    {
+      "epoch": 11.10397946084724,
+      "grad_norm": 0.2327316552400589,
+      "learning_rate": 0.000495937052932761,
+      "loss": 2.4889,
+      "step": 173000
+    },
+    {
+      "epoch": 11.16816431322208,
+      "grad_norm": 0.24159583449363708,
+      "learning_rate": 0.0004923605150214592,
+      "loss": 2.4953,
+      "step": 174000
+    },
+    {
+      "epoch": 11.232349165596919,
+      "grad_norm": 0.22464554011821747,
+      "learning_rate": 0.0004887875536480687,
+      "loss": 2.4973,
+      "step": 175000
+    },
+    {
+      "epoch": 11.296534017971759,
+      "grad_norm": 0.24839630722999573,
+      "learning_rate": 0.0004852110157367668,
+      "loss": 2.5079,
+      "step": 176000
+    },
+    {
+      "epoch": 11.360718870346599,
+      "grad_norm": 0.28848448395729065,
+      "learning_rate": 0.00048163805436337627,
+      "loss": 2.5107,
+      "step": 177000
+    },
+    {
+      "epoch": 11.424903722721437,
+      "grad_norm": 0.24984413385391235,
+      "learning_rate": 0.0004780615164520744,
+      "loss": 2.5192,
+      "step": 178000
+    },
+    {
+      "epoch": 11.489088575096277,
+      "grad_norm": 0.260431170463562,
+      "learning_rate": 0.0004744885550786838,
+      "loss": 2.5257,
+      "step": 179000
+    },
+    {
+      "epoch": 11.553273427471117,
+      "grad_norm": 0.25838425755500793,
+      "learning_rate": 0.00047091201716738197,
+      "loss": 2.5248,
+      "step": 180000
+    },
+    {
+      "epoch": 11.617458279845957,
+      "grad_norm": 0.25366073846817017,
+      "learning_rate": 0.0004673354792560801,
+      "loss": 2.5294,
+      "step": 181000
+    },
+    {
+      "epoch": 11.681643132220795,
+      "grad_norm": 0.2690897583961487,
+      "learning_rate": 0.00046376609442060085,
+      "loss": 2.5306,
+      "step": 182000
+    },
+    {
+      "epoch": 11.745827984595635,
+      "grad_norm": 0.22692571580410004,
+      "learning_rate": 0.000460189556509299,
+      "loss": 2.5306,
+      "step": 183000
+    },
+    {
+      "epoch": 11.810012836970476,
+      "grad_norm": 0.26649022102355957,
+      "learning_rate": 0.00045661301859799714,
+      "loss": 2.5373,
+      "step": 184000
+    },
+    {
+      "epoch": 11.874197689345314,
+      "grad_norm": 0.23332759737968445,
+      "learning_rate": 0.0004530364806866953,
+      "loss": 2.5345,
+      "step": 185000
+    },
+    {
+      "epoch": 11.938382541720154,
+      "grad_norm": 0.2456694394350052,
+      "learning_rate": 0.0004494635193133047,
+      "loss": 2.5402,
+      "step": 186000
+    },
+    {
+      "epoch": 12.0,
+      "eval_accuracy": 0.4266437213183146,
+      "eval_loss": 2.9544591903686523,
+      "eval_runtime": 112.4087,
+      "eval_samples_per_second": 468.291,
+      "eval_steps_per_second": 7.321,
+      "step": 186960
+    },
+    {
+      "epoch": 12.002567394094994,
+      "grad_norm": 0.27514010667800903,
+      "learning_rate": 0.00044589055793991416,
+      "loss": 2.5287,
+      "step": 187000
+    },
+    {
+      "epoch": 12.066752246469832,
+      "grad_norm": 0.23540076613426208,
+      "learning_rate": 0.0004423140200286123,
+      "loss": 2.4584,
+      "step": 188000
+    },
+    {
+      "epoch": 12.130937098844672,
+      "grad_norm": 0.2580265402793884,
+      "learning_rate": 0.00043873748211731045,
+      "loss": 2.4659,
+      "step": 189000
+    },
+    {
+      "epoch": 12.195121951219512,
+      "grad_norm": 0.2443842738866806,
+      "learning_rate": 0.0004351609442060086,
+      "loss": 2.4725,
+      "step": 190000
+    },
+    {
+      "epoch": 12.259306803594352,
+      "grad_norm": 0.24378903210163116,
+      "learning_rate": 0.000431587982832618,
+      "loss": 2.4779,
+      "step": 191000
+    },
+    {
+      "epoch": 12.32349165596919,
+      "grad_norm": 0.24195650219917297,
+      "learning_rate": 0.00042801144492131616,
+      "loss": 2.4849,
+      "step": 192000
+    },
+    {
+      "epoch": 12.38767650834403,
+      "grad_norm": 0.24276842176914215,
+      "learning_rate": 0.0004244349070100143,
+      "loss": 2.4877,
+      "step": 193000
+    },
+    {
+      "epoch": 12.45186136071887,
+      "grad_norm": 0.26197871565818787,
+      "learning_rate": 0.00042086194563662377,
+      "loss": 2.491,
+      "step": 194000
+    },
+    {
+      "epoch": 12.51604621309371,
+      "grad_norm": 0.2582203149795532,
+      "learning_rate": 0.0004172854077253219,
+      "loss": 2.4985,
+      "step": 195000
+    },
+    {
+      "epoch": 12.58023106546855,
+      "grad_norm": 0.24834005534648895,
+      "learning_rate": 0.0004137124463519313,
+      "loss": 2.498,
+      "step": 196000
+    },
+    {
+      "epoch": 12.64441591784339,
+      "grad_norm": 0.3061864376068115,
+      "learning_rate": 0.0004101359084406295,
+      "loss": 2.4973,
+      "step": 197000
+    },
+    {
+      "epoch": 12.70860077021823,
+      "grad_norm": 0.30300888419151306,
+      "learning_rate": 0.0004065593705293276,
+      "loss": 2.4992,
+      "step": 198000
+    },
+    {
+      "epoch": 12.772785622593068,
+      "grad_norm": 0.24682293832302094,
+      "learning_rate": 0.00040298640915593703,
+      "loss": 2.5085,
+      "step": 199000
+    },
+    {
+      "epoch": 12.836970474967908,
+      "grad_norm": 0.24502244591712952,
+      "learning_rate": 0.0003994134477825465,
+      "loss": 2.5122,
+      "step": 200000
+    },
+    {
+      "epoch": 12.901155327342748,
+      "grad_norm": 0.23922552168369293,
+      "learning_rate": 0.00039583690987124464,
+      "loss": 2.5128,
+      "step": 201000
+    },
+    {
+      "epoch": 12.965340179717586,
+      "grad_norm": 0.2427544891834259,
+      "learning_rate": 0.0003922603719599428,
+      "loss": 2.5136,
+      "step": 202000
+    },
+    {
+      "epoch": 13.0,
+      "eval_accuracy": 0.4266956463436438,
+      "eval_loss": 2.9615485668182373,
+      "eval_runtime": 112.5448,
+      "eval_samples_per_second": 467.725,
+      "eval_steps_per_second": 7.313,
+      "step": 202540
+    },
+    {
+      "epoch": 13.029525032092426,
+      "grad_norm": 0.24927350878715515,
+      "learning_rate": 0.00038868383404864093,
+      "loss": 2.4798,
+      "step": 203000
+    },
+    {
+      "epoch": 13.093709884467266,
+      "grad_norm": 0.2634033262729645,
+      "learning_rate": 0.00038511087267525035,
+      "loss": 2.4386,
+      "step": 204000
+    },
+    {
+      "epoch": 13.157894736842104,
+      "grad_norm": 0.2785243093967438,
+      "learning_rate": 0.0003815343347639485,
+      "loss": 2.4426,
+      "step": 205000
+    },
+    {
+      "epoch": 13.222079589216944,
+      "grad_norm": 0.2686097323894501,
+      "learning_rate": 0.00037795779685264664,
+      "loss": 2.4475,
+      "step": 206000
+    },
+    {
+      "epoch": 13.286264441591785,
+      "grad_norm": 0.3374476432800293,
+      "learning_rate": 0.0003743848354792561,
+      "loss": 2.4584,
+      "step": 207000
+    },
+    {
+      "epoch": 13.350449293966625,
+      "grad_norm": 0.2582445740699768,
+      "learning_rate": 0.0003708082975679542,
+      "loss": 2.46,
+      "step": 208000
+    },
+    {
+      "epoch": 13.414634146341463,
+      "grad_norm": 0.25310373306274414,
+      "learning_rate": 0.00036723533619456366,
+      "loss": 2.4634,
+      "step": 209000
+    },
+    {
+      "epoch": 13.478818998716303,
+      "grad_norm": 0.2605305016040802,
+      "learning_rate": 0.0003636587982832618,
+      "loss": 2.4691,
+      "step": 210000
+    },
+    {
+      "epoch": 13.543003851091143,
+      "grad_norm": 0.2543717622756958,
+      "learning_rate": 0.00036008226037195995,
+      "loss": 2.4781,
+      "step": 211000
+    },
+    {
+      "epoch": 13.607188703465981,
+      "grad_norm": 0.23427140712738037,
+      "learning_rate": 0.00035650929899856936,
+      "loss": 2.4727,
+      "step": 212000
+    },
+    {
+      "epoch": 13.671373555840821,
+      "grad_norm": 0.27259138226509094,
+      "learning_rate": 0.0003529327610872675,
+      "loss": 2.4774,
+      "step": 213000
+    },
+    {
+      "epoch": 13.735558408215661,
+      "grad_norm": 0.30411496758461,
+      "learning_rate": 0.000349359799713877,
+      "loss": 2.4781,
+      "step": 214000
+    },
+    {
+      "epoch": 13.7997432605905,
+      "grad_norm": 0.24120020866394043,
+      "learning_rate": 0.0003457832618025751,
+      "loss": 2.4833,
+      "step": 215000
+    },
+    {
+      "epoch": 13.86392811296534,
+      "grad_norm": 0.36322614550590515,
+      "learning_rate": 0.00034221030042918453,
+      "loss": 2.4847,
+      "step": 216000
+    },
+    {
+      "epoch": 13.92811296534018,
+      "grad_norm": 0.2745082974433899,
+      "learning_rate": 0.0003386337625178827,
+      "loss": 2.4843,
+      "step": 217000
+    },
+    {
+      "epoch": 13.99229781771502,
+      "grad_norm": 0.31372272968292236,
+      "learning_rate": 0.0003350572246065808,
+      "loss": 2.4814,
+      "step": 218000
+    },
+    {
+      "epoch": 14.0,
+      "eval_accuracy": 0.4261248435544431,
+      "eval_loss": 2.9729630947113037,
+      "eval_runtime": 112.6175,
+      "eval_samples_per_second": 467.423,
+      "eval_steps_per_second": 7.308,
+      "step": 218120
+    },
+    {
+      "epoch": 14.056482670089858,
+      "grad_norm": 0.2546994984149933,
+      "learning_rate": 0.00033148068669527897,
+      "loss": 2.4229,
+      "step": 219000
+    },
+    {
+      "epoch": 14.120667522464698,
+      "grad_norm": 0.29706746339797974,
+      "learning_rate": 0.0003279077253218884,
+      "loss": 2.4175,
+      "step": 220000
+    },
+    {
+      "epoch": 14.184852374839538,
+      "grad_norm": 0.40730276703834534,
+      "learning_rate": 0.00032433476394849785,
+      "loss": 2.425,
+      "step": 221000
+    },
+    {
+      "epoch": 14.249037227214377,
+      "grad_norm": 0.2597196400165558,
+      "learning_rate": 0.000320758226037196,
+      "loss": 2.4344,
+      "step": 222000
+    },
+    {
+      "epoch": 14.313222079589217,
+      "grad_norm": 0.2576155662536621,
+      "learning_rate": 0.00031718168812589414,
+      "loss": 2.433,
+      "step": 223000
+    },
+    {
+      "epoch": 14.377406931964057,
+      "grad_norm": 0.2582731246948242,
+      "learning_rate": 0.0003136051502145923,
+      "loss": 2.4325,
+      "step": 224000
+    },
+    {
+      "epoch": 14.441591784338897,
+      "grad_norm": 0.25023168325424194,
+      "learning_rate": 0.0003100321888412017,
+      "loss": 2.4451,
+      "step": 225000
+    },
+    {
+      "epoch": 14.505776636713735,
+      "grad_norm": 0.3350090980529785,
+      "learning_rate": 0.00030645565092989984,
+      "loss": 2.4445,
+      "step": 226000
+    },
+    {
+      "epoch": 14.569961489088575,
+      "grad_norm": 0.30197829008102417,
+      "learning_rate": 0.0003028826895565093,
+      "loss": 2.4478,
+      "step": 227000
+    },
+    {
+      "epoch": 14.634146341463415,
+      "grad_norm": 0.2431783527135849,
+      "learning_rate": 0.00029930615164520746,
+      "loss": 2.4494,
+      "step": 228000
+    },
+    {
+      "epoch": 14.698331193838253,
+      "grad_norm": 0.2774644196033478,
+      "learning_rate": 0.0002957296137339056,
+      "loss": 2.4535,
+      "step": 229000
+    },
+    {
+      "epoch": 14.762516046213094,
+      "grad_norm": 0.27773821353912354,
+      "learning_rate": 0.000292156652360515,
+      "loss": 2.4548,
+      "step": 230000
+    },
+    {
+      "epoch": 14.826700898587934,
+      "grad_norm": 0.35124674439430237,
+      "learning_rate": 0.0002885836909871244,
+      "loss": 2.4565,
+      "step": 231000
+    },
+    {
+      "epoch": 14.890885750962774,
+      "grad_norm": 0.28358736634254456,
+      "learning_rate": 0.00028500715307582257,
+      "loss": 2.4572,
+      "step": 232000
+    },
+    {
+      "epoch": 14.955070603337612,
+      "grad_norm": 0.27410945296287537,
+      "learning_rate": 0.0002814306151645207,
+      "loss": 2.4607,
+      "step": 233000
+    },
+    {
+      "epoch": 15.0,
+      "eval_accuracy": 0.4256197478991597,
+      "eval_loss": 2.9826996326446533,
+      "eval_runtime": 112.5916,
+      "eval_samples_per_second": 467.531,
+      "eval_steps_per_second": 7.31,
+      "step": 233700
+    },
+    {
+      "epoch": 15.019255455712452,
+      "grad_norm": 0.27103665471076965,
+      "learning_rate": 0.00027785407725321886,
+      "loss": 2.4425,
+      "step": 234000
+    },
+    {
+      "epoch": 15.083440308087292,
+      "grad_norm": 0.26552310585975647,
+      "learning_rate": 0.00027428111587982833,
+      "loss": 2.3947,
+      "step": 235000
+    },
+    {
+      "epoch": 15.14762516046213,
+      "grad_norm": 0.2552954852581024,
+      "learning_rate": 0.0002707045779685265,
+      "loss": 2.4,
+      "step": 236000
+    },
+    {
+      "epoch": 15.21181001283697,
+      "grad_norm": 0.27387532591819763,
+      "learning_rate": 0.0002671280400572246,
+      "loss": 2.4109,
+      "step": 237000
+    },
+    {
+      "epoch": 15.27599486521181,
+      "grad_norm": 0.3243711292743683,
+      "learning_rate": 0.00026355150214592277,
+      "loss": 2.4071,
+      "step": 238000
+    },
+    {
+      "epoch": 15.340179717586649,
+      "grad_norm": 0.27082788944244385,
+      "learning_rate": 0.0002599785407725322,
+      "loss": 2.4118,
+      "step": 239000
+    },
+    {
+      "epoch": 15.404364569961489,
+      "grad_norm": 0.27366554737091064,
+      "learning_rate": 0.0002564055793991416,
+      "loss": 2.4123,
+      "step": 240000
+    },
+    {
+      "epoch": 15.468549422336329,
+      "grad_norm": 0.2563120722770691,
+      "learning_rate": 0.00025282904148783974,
+      "loss": 2.4177,
+      "step": 241000
+    },
+    {
+      "epoch": 15.532734274711169,
+      "grad_norm": 0.2995336949825287,
+      "learning_rate": 0.00024925250357653794,
+      "loss": 2.4236,
+      "step": 242000
+    },
+    {
+      "epoch": 15.596919127086007,
+      "grad_norm": 0.2699737548828125,
+      "learning_rate": 0.0002456759656652361,
+      "loss": 2.4274,
+      "step": 243000
+    },
+    {
+      "epoch": 15.661103979460847,
+      "grad_norm": 0.2669052481651306,
+      "learning_rate": 0.0002420994277539342,
+      "loss": 2.4262,
+      "step": 244000
+    },
+    {
+      "epoch": 15.725288831835687,
+      "grad_norm": 0.2465328872203827,
+      "learning_rate": 0.00023852646638054364,
+      "loss": 2.4262,
+      "step": 245000
+    },
+    {
+      "epoch": 15.789473684210526,
+      "grad_norm": 0.27883973717689514,
+      "learning_rate": 0.0002349499284692418,
+      "loss": 2.4294,
+      "step": 246000
+    },
+    {
+      "epoch": 15.853658536585366,
+      "grad_norm": 0.29509106278419495,
+      "learning_rate": 0.0002313769670958512,
+      "loss": 2.4359,
+      "step": 247000
+    },
+    {
+      "epoch": 15.917843388960206,
+      "grad_norm": 0.2847062945365906,
+      "learning_rate": 0.00022780042918454934,
+      "loss": 2.4355,
+      "step": 248000
+    },
+    {
+      "epoch": 15.982028241335044,
+      "grad_norm": 0.2528593838214874,
+      "learning_rate": 0.0002242238912732475,
+      "loss": 2.4394,
+      "step": 249000
+    },
+    {
+      "epoch": 16.0,
+      "eval_accuracy": 0.42555724417426544,
+      "eval_loss": 2.993483543395996,
+      "eval_runtime": 114.5219,
+      "eval_samples_per_second": 459.65,
+      "eval_steps_per_second": 7.186,
+      "step": 249280
+    },
+    {
+      "epoch": 16.046213093709884,
+      "grad_norm": 0.300592303276062,
+      "learning_rate": 0.00022065092989985693,
+      "loss": 2.3904,
+      "step": 250000
+    },
+    {
+      "epoch": 16.110397946084724,
+      "grad_norm": 0.26522037386894226,
+      "learning_rate": 0.00021707439198855508,
+      "loss": 2.3843,
+      "step": 251000
+    },
+    {
+      "epoch": 16.174582798459564,
+      "grad_norm": 0.27688267827033997,
+      "learning_rate": 0.00021350143061516454,
+      "loss": 2.3842,
+      "step": 252000
+    },
+    {
+      "epoch": 16.238767650834404,
+      "grad_norm": 0.28256699442863464,
+      "learning_rate": 0.00020992489270386266,
+      "loss": 2.3902,
+      "step": 253000
+    },
+    {
+      "epoch": 16.30295250320924,
+      "grad_norm": 0.25413182377815247,
+      "learning_rate": 0.00020635193133047212,
+      "loss": 2.3907,
+      "step": 254000
+    },
+    {
+      "epoch": 16.36713735558408,
+      "grad_norm": 0.2933899760246277,
+      "learning_rate": 0.00020277539341917024,
+      "loss": 2.3951,
+      "step": 255000
+    },
+    {
+      "epoch": 16.43132220795892,
+      "grad_norm": 0.2763221263885498,
+      "learning_rate": 0.0001992024320457797,
+      "loss": 2.3993,
+      "step": 256000
+    },
+    {
+      "epoch": 16.49550706033376,
+      "grad_norm": 0.27912837266921997,
+      "learning_rate": 0.00019562589413447783,
+      "loss": 2.3998,
+      "step": 257000
+    },
+    {
+      "epoch": 16.5596919127086,
+      "grad_norm": 0.25394317507743835,
+      "learning_rate": 0.00019204935622317595,
+      "loss": 2.3953,
+      "step": 258000
+    },
+    {
+      "epoch": 16.62387676508344,
+      "grad_norm": 0.264137327671051,
+      "learning_rate": 0.0001884728183118741,
+      "loss": 2.4006,
+      "step": 259000
+    },
+    {
+      "epoch": 16.68806161745828,
+      "grad_norm": 0.3014145791530609,
+      "learning_rate": 0.00018489985693848356,
+      "loss": 2.4017,
+      "step": 260000
+    },
+    {
+      "epoch": 16.752246469833118,
+      "grad_norm": 0.26256245374679565,
+      "learning_rate": 0.00018132331902718168,
+      "loss": 2.4029,
+      "step": 261000
+    },
+    {
+      "epoch": 16.816431322207958,
+      "grad_norm": 0.2925412654876709,
+      "learning_rate": 0.00017775035765379114,
+      "loss": 2.4056,
+      "step": 262000
+    },
+    {
+      "epoch": 16.880616174582798,
+      "grad_norm": 0.26922452449798584,
+      "learning_rate": 0.00017417381974248926,
+      "loss": 2.4098,
+      "step": 263000
+    },
+    {
+      "epoch": 16.944801026957638,
+      "grad_norm": 0.27031683921813965,
+      "learning_rate": 0.0001705972818311874,
+      "loss": 2.4062,
+      "step": 264000
+    },
+    {
+      "epoch": 17.0,
+      "eval_accuracy": 0.42530469634662377,
+      "eval_loss": 3.0018820762634277,
+      "eval_runtime": 113.4358,
+      "eval_samples_per_second": 464.051,
+      "eval_steps_per_second": 7.255,
+      "step": 264860
+    },
+    {
+      "epoch": 17.008985879332478,
+      "grad_norm": 0.2683101296424866,
+      "learning_rate": 0.00016702432045779687,
+      "loss": 2.4048,
+      "step": 265000
+    },
+    {
+      "epoch": 17.073170731707318,
+      "grad_norm": 0.3228018581867218,
+      "learning_rate": 0.0001634513590844063,
+      "loss": 2.3623,
+      "step": 266000
+    },
+    {
+      "epoch": 17.137355584082158,
+      "grad_norm": 0.2701365351676941,
+      "learning_rate": 0.00015987482117310446,
+      "loss": 2.3628,
+      "step": 267000
+    },
+    {
+      "epoch": 17.201540436456995,
+      "grad_norm": 0.28526586294174194,
+      "learning_rate": 0.00015629828326180258,
+      "loss": 2.3659,
+      "step": 268000
+    },
+    {
+      "epoch": 17.265725288831835,
+      "grad_norm": 0.2874760925769806,
+      "learning_rate": 0.0001527217453505007,
+      "loss": 2.3639,
+      "step": 269000
+    },
+    {
+      "epoch": 17.329910141206675,
+      "grad_norm": 0.2576092779636383,
+      "learning_rate": 0.00014914878397711016,
+      "loss": 2.372,
+      "step": 270000
+    },
+    {
+      "epoch": 17.394094993581515,
+      "grad_norm": 0.3425317108631134,
+      "learning_rate": 0.0001455722460658083,
+      "loss": 2.3713,
+      "step": 271000
+    },
+    {
+      "epoch": 17.458279845956355,
+      "grad_norm": 0.2755325436592102,
+      "learning_rate": 0.00014199928469241775,
+      "loss": 2.3775,
+      "step": 272000
+    },
+    {
+      "epoch": 17.522464698331195,
+      "grad_norm": 0.2874724268913269,
+      "learning_rate": 0.0001384227467811159,
+      "loss": 2.3802,
+      "step": 273000
+    },
+    {
+      "epoch": 17.586649550706035,
+      "grad_norm": 0.34283754229545593,
+      "learning_rate": 0.00013484978540772533,
+      "loss": 2.3774,
+      "step": 274000
+    },
+    {
+      "epoch": 17.65083440308087,
+      "grad_norm": 0.2768910825252533,
+      "learning_rate": 0.00013127682403433477,
+      "loss": 2.376,
+      "step": 275000
+    },
+    {
+      "epoch": 17.71501925545571,
+      "grad_norm": 0.263095498085022,
+      "learning_rate": 0.00012770028612303291,
+      "loss": 2.3822,
+      "step": 276000
+    },
+    {
+      "epoch": 17.77920410783055,
+      "grad_norm": 0.2954932749271393,
+      "learning_rate": 0.00012412374821173103,
+      "loss": 2.3805,
+      "step": 277000
+    },
+    {
+      "epoch": 17.84338896020539,
+      "grad_norm": 0.28619909286499023,
+      "learning_rate": 0.00012054721030042918,
+      "loss": 2.3861,
+      "step": 278000
+    },
+    {
+      "epoch": 17.90757381258023,
+      "grad_norm": 0.3277396857738495,
+      "learning_rate": 0.00011697067238912733,
+      "loss": 2.3902,
+      "step": 279000
+    },
+    {
+      "epoch": 17.971758664955072,
+      "grad_norm": 0.3048025965690613,
+      "learning_rate": 0.00011339771101573678,
+      "loss": 2.3845,
+      "step": 280000
+    },
+    {
+      "epoch": 18.0,
+      "eval_accuracy": 0.42455025627272186,
+      "eval_loss": 3.015594482421875,
+      "eval_runtime": 113.2739,
+      "eval_samples_per_second": 464.714,
+      "eval_steps_per_second": 7.266,
+      "step": 280440
+    },
+    {
+      "epoch": 18.035943517329912,
+      "grad_norm": 0.2825532853603363,
+      "learning_rate": 0.00010982117310443491,
+      "loss": 2.3589,
+      "step": 281000
+    },
+    {
+      "epoch": 18.10012836970475,
+      "grad_norm": 0.319431871175766,
+      "learning_rate": 0.00010624463519313304,
+      "loss": 2.3481,
+      "step": 282000
+    },
+    {
+      "epoch": 18.16431322207959,
+      "grad_norm": 0.27330195903778076,
+      "learning_rate": 0.0001026716738197425,
+      "loss": 2.3508,
+      "step": 283000
+    },
+    {
+      "epoch": 18.22849807445443,
+      "grad_norm": 0.30694159865379333,
+      "learning_rate": 9.909513590844063e-05,
+      "loss": 2.3498,
+      "step": 284000
+    },
+    {
+      "epoch": 18.29268292682927,
+      "grad_norm": 0.282497376203537,
+      "learning_rate": 9.551859799713878e-05,
+      "loss": 2.3433,
+      "step": 285000
+    },
+    {
+      "epoch": 18.35686777920411,
+      "grad_norm": 0.29808005690574646,
+      "learning_rate": 9.194206008583691e-05,
+      "loss": 2.3527,
+      "step": 286000
+    },
+    {
+      "epoch": 18.42105263157895,
+      "grad_norm": 0.27210718393325806,
+      "learning_rate": 8.837267525035765e-05,
+      "loss": 2.356,
+      "step": 287000
+    },
+    {
+      "epoch": 18.485237483953785,
+      "grad_norm": 0.3066132962703705,
+      "learning_rate": 8.47961373390558e-05,
+      "loss": 2.3604,
+      "step": 288000
+    },
+    {
+      "epoch": 18.549422336328625,
+      "grad_norm": 0.3081810176372528,
+      "learning_rate": 8.121959942775393e-05,
+      "loss": 2.353,
+      "step": 289000
+    },
+    {
+      "epoch": 18.613607188703465,
+      "grad_norm": 0.31945279240608215,
+      "learning_rate": 7.764306151645208e-05,
+      "loss": 2.3577,
+      "step": 290000
+    },
+    {
+      "epoch": 18.677792041078305,
+      "grad_norm": 0.2833667993545532,
+      "learning_rate": 7.407010014306153e-05,
+      "loss": 2.3567,
+      "step": 291000
+    },
+    {
+      "epoch": 18.741976893453145,
+      "grad_norm": 0.30316558480262756,
+      "learning_rate": 7.049356223175966e-05,
+      "loss": 2.3606,
+      "step": 292000
+    },
+    {
+      "epoch": 18.806161745827985,
+      "grad_norm": 0.2820417284965515,
+      "learning_rate": 6.69170243204578e-05,
+      "loss": 2.3615,
+      "step": 293000
+    },
+    {
+      "epoch": 18.870346598202826,
+      "grad_norm": 0.2879752516746521,
+      "learning_rate": 6.334406294706723e-05,
+      "loss": 2.3626,
+      "step": 294000
+    },
+    {
+      "epoch": 18.934531450577662,
+      "grad_norm": 0.3219081461429596,
+      "learning_rate": 5.976752503576538e-05,
+      "loss": 2.363,
+      "step": 295000
+    },
+    {
+      "epoch": 18.998716302952502,
+      "grad_norm": 0.28323307633399963,
+      "learning_rate": 5.619456366237482e-05,
+      "loss": 2.3632,
+      "step": 296000
+    },
+    {
+      "epoch": 19.0,
+      "eval_accuracy": 0.4244418618511234,
+      "eval_loss": 3.0269229412078857,
+      "eval_runtime": 113.5061,
+      "eval_samples_per_second": 463.764,
+      "eval_steps_per_second": 7.251,
+      "step": 296020
+    },
+    {
+      "epoch": 19.062901155327342,
+      "grad_norm": 0.2774474322795868,
+      "learning_rate": 5.261802575107296e-05,
+      "loss": 2.3301,
+      "step": 297000
+    },
+    {
+      "epoch": 19.127086007702182,
+      "grad_norm": 0.30794039368629456,
+      "learning_rate": 4.90414878397711e-05,
+      "loss": 2.3322,
+      "step": 298000
+    },
+    {
+      "epoch": 19.191270860077022,
+      "grad_norm": 0.3270117938518524,
+      "learning_rate": 4.546494992846924e-05,
+      "loss": 2.3376,
+      "step": 299000
+    },
+    {
+      "epoch": 19.255455712451862,
+      "grad_norm": 0.2921634614467621,
+      "learning_rate": 4.189198855507869e-05,
+      "loss": 2.3358,
+      "step": 300000
+    },
+    {
+      "epoch": 19.319640564826702,
+      "grad_norm": 0.29304492473602295,
+      "learning_rate": 3.8315450643776827e-05,
+      "loss": 2.3367,
+      "step": 301000
+    },
+    {
+      "epoch": 19.38382541720154,
+      "grad_norm": 0.2906641662120819,
+      "learning_rate": 3.4742489270386264e-05,
+      "loss": 2.3334,
+      "step": 302000
+    },
+    {
+      "epoch": 19.44801026957638,
+      "grad_norm": 0.3057336211204529,
+      "learning_rate": 3.1165951359084404e-05,
+      "loss": 2.3384,
+      "step": 303000
+    },
+    {
+      "epoch": 19.51219512195122,
+      "grad_norm": 0.2955686151981354,
+      "learning_rate": 2.7589413447782547e-05,
+      "loss": 2.3385,
+      "step": 304000
+    },
+    {
+      "epoch": 19.57637997432606,
+      "grad_norm": 0.2858632206916809,
+      "learning_rate": 2.401645207439199e-05,
+      "loss": 2.3363,
+      "step": 305000
+    },
+    {
+      "epoch": 19.6405648267009,
+      "grad_norm": 0.2869555652141571,
+      "learning_rate": 2.0439914163090128e-05,
+      "loss": 2.3394,
+      "step": 306000
+    },
+    {
+      "epoch": 19.70474967907574,
+      "grad_norm": 0.28836971521377563,
+      "learning_rate": 1.6866952789699572e-05,
+      "loss": 2.3366,
+      "step": 307000
+    },
+    {
+      "epoch": 19.76893453145058,
+      "grad_norm": 0.2759711742401123,
+      "learning_rate": 1.329041487839771e-05,
+      "loss": 2.3377,
+      "step": 308000
+    },
+    {
+      "epoch": 19.833119383825416,
+      "grad_norm": 0.2844057083129883,
+      "learning_rate": 9.71387696709585e-06,
+      "loss": 2.3386,
+      "step": 309000
+    },
+    {
+      "epoch": 19.897304236200256,
+      "grad_norm": 0.302503377199173,
+      "learning_rate": 6.140915593705294e-06,
+      "loss": 2.3388,
+      "step": 310000
+    },
+    {
+      "epoch": 19.961489088575096,
+      "grad_norm": 0.2948252558708191,
+      "learning_rate": 2.5643776824034334e-06,
+      "loss": 2.3437,
+      "step": 311000
+    },
+    {
+      "epoch": 20.0,
+      "eval_accuracy": 0.4239814649263961,
+      "eval_loss": 3.035736322402954,
+      "eval_runtime": 113.6992,
+      "eval_samples_per_second": 462.976,
+      "eval_steps_per_second": 7.238,
+      "step": 311600
+    },
+    {
+      "epoch": 20.0,
+      "step": 311600,
+      "total_flos": 1.30265052217344e+18,
+      "train_loss": 2.6546336687942524,
+      "train_runtime": 44210.8645,
+      "train_samples_per_second": 225.529,
+      "train_steps_per_second": 7.048
+    }
+  ],
+  "logging_steps": 1000,
+  "max_steps": 311600,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 5000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.30265052217344e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}