diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24764 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 17646, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 2496.8073030248033, + "learning_rate": 5.6657223796033996e-09, + "loss": 12.3922, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 2798.9639739819045, + "learning_rate": 2.8328611898017002e-08, + "loss": 12.556, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 3186.5811854816743, + "learning_rate": 5.6657223796034004e-08, + "loss": 12.6663, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 2433.131292284797, + "learning_rate": 8.4985835694051e-08, + "loss": 11.8166, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 927.7244167671865, + "learning_rate": 1.1331444759206801e-07, + "loss": 9.872, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 598.7163941919265, + "learning_rate": 1.41643059490085e-07, + "loss": 8.1112, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 593.7817469340148, + "learning_rate": 1.69971671388102e-07, + "loss": 7.6257, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 361.1842229161203, + "learning_rate": 1.9830028328611898e-07, + "loss": 6.8663, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 261.41360349547375, + "learning_rate": 2.2662889518413602e-07, + "loss": 6.297, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 294.80563694492105, + "learning_rate": 2.54957507082153e-07, + "loss": 5.8564, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 223.89911973808375, + "learning_rate": 2.8328611898017e-07, + "loss": 5.5386, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 205.52507903905445, + "learning_rate": 3.1161473087818695e-07, + "loss": 5.232, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 115.75404026789134, + "learning_rate": 3.39943342776204e-07, + "loss": 5.065, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 76.96969560974317, + "learning_rate": 3.6827195467422096e-07, + "loss": 4.862, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 80.34469970951176, + "learning_rate": 3.9660056657223797e-07, + "loss": 4.7516, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 81.7595945293725, + "learning_rate": 4.24929178470255e-07, + "loss": 4.5732, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 63.15931120806828, + "learning_rate": 4.5325779036827203e-07, + "loss": 4.4904, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 56.380024867999694, + "learning_rate": 4.815864022662889e-07, + "loss": 4.3397, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 60.056632903837375, + "learning_rate": 5.09915014164306e-07, + "loss": 4.2419, + "step": 90 + }, + { + "epoch": 0.02, + "grad_norm": 57.72643638091866, + "learning_rate": 5.382436260623229e-07, + "loss": 4.2126, + "step": 95 + }, + { + "epoch": 0.02, + "grad_norm": 44.8303385559054, + "learning_rate": 5.6657223796034e-07, + "loss": 4.083, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 45.71148017336654, + "learning_rate": 5.949008498583571e-07, + "loss": 3.9693, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 46.76823280991884, + "learning_rate": 6.232294617563739e-07, + "loss": 3.8478, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 42.27034196603908, + "learning_rate": 6.51558073654391e-07, + "loss": 3.8355, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 43.786683236286535, + "learning_rate": 6.79886685552408e-07, + "loss": 3.7417, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 33.38556504010887, + "learning_rate": 7.08215297450425e-07, + "loss": 3.619, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 38.77121164219573, + "learning_rate": 7.365439093484419e-07, + "loss": 3.6023, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 35.85722948165567, + "learning_rate": 7.64872521246459e-07, + "loss": 3.5028, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 34.87382466876409, + "learning_rate": 7.932011331444759e-07, + "loss": 3.3987, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 28.499096674335632, + "learning_rate": 8.215297450424931e-07, + "loss": 3.3644, + "step": 145 + }, + { + "epoch": 0.03, + "grad_norm": 26.379175262470692, + "learning_rate": 8.4985835694051e-07, + "loss": 3.2703, + "step": 150 + }, + { + "epoch": 0.03, + "grad_norm": 29.865960399124674, + "learning_rate": 8.781869688385269e-07, + "loss": 3.2143, + "step": 155 + }, + { + "epoch": 0.03, + "grad_norm": 25.246604944858476, + "learning_rate": 9.065155807365441e-07, + "loss": 3.1724, + "step": 160 + }, + { + "epoch": 0.03, + "grad_norm": 26.803614997802278, + "learning_rate": 9.34844192634561e-07, + "loss": 3.1165, + "step": 165 + }, + { + "epoch": 0.03, + "grad_norm": 24.868744499172813, + "learning_rate": 9.631728045325779e-07, + "loss": 3.0226, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 28.072404376457335, + "learning_rate": 9.91501416430595e-07, + "loss": 2.974, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 24.13981569942378, + "learning_rate": 1.019830028328612e-06, + "loss": 2.8986, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 24.682144066119207, + "learning_rate": 1.048158640226629e-06, + "loss": 2.8929, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 26.393633161733234, + "learning_rate": 1.0764872521246459e-06, + "loss": 2.8201, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 25.165888181101067, + "learning_rate": 1.104815864022663e-06, + "loss": 2.7715, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 24.497827155588414, + "learning_rate": 1.13314447592068e-06, + "loss": 2.7423, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 22.577170865053922, + "learning_rate": 1.1614730878186968e-06, + "loss": 2.6814, + "step": 205 + }, + { + "epoch": 0.04, + "grad_norm": 26.811068284154047, + "learning_rate": 1.1898016997167141e-06, + "loss": 2.681, + "step": 210 + }, + { + "epoch": 0.04, + "grad_norm": 23.625896687896763, + "learning_rate": 1.218130311614731e-06, + "loss": 2.6641, + "step": 215 + }, + { + "epoch": 0.04, + "grad_norm": 25.558263239239487, + "learning_rate": 1.2464589235127478e-06, + "loss": 2.6209, + "step": 220 + }, + { + "epoch": 0.04, + "grad_norm": 28.59558876783872, + "learning_rate": 1.274787535410765e-06, + "loss": 2.6035, + "step": 225 + }, + { + "epoch": 0.04, + "grad_norm": 22.99206869102384, + "learning_rate": 1.303116147308782e-06, + "loss": 2.5784, + "step": 230 + }, + { + "epoch": 0.04, + "grad_norm": 24.770207084904367, + "learning_rate": 1.331444759206799e-06, + "loss": 2.5403, + "step": 235 + }, + { + "epoch": 0.04, + "grad_norm": 26.943253914438525, + "learning_rate": 1.359773371104816e-06, + "loss": 2.4988, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 28.669675628003457, + "learning_rate": 1.388101983002833e-06, + "loss": 2.4725, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 32.51240263679756, + "learning_rate": 1.41643059490085e-06, + "loss": 2.4197, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 24.03710460536725, + "learning_rate": 1.444759206798867e-06, + "loss": 2.4541, + "step": 255 + }, + { + "epoch": 0.04, + "grad_norm": 30.538221432223352, + "learning_rate": 1.4730878186968839e-06, + "loss": 2.375, + "step": 260 + }, + { + "epoch": 0.05, + "grad_norm": 27.4383780573818, + "learning_rate": 1.5014164305949011e-06, + "loss": 2.4054, + "step": 265 + }, + { + "epoch": 0.05, + "grad_norm": 38.83226342367871, + "learning_rate": 1.529745042492918e-06, + "loss": 2.4019, + "step": 270 + }, + { + "epoch": 0.05, + "grad_norm": 22.04189249067676, + "learning_rate": 1.558073654390935e-06, + "loss": 2.3203, + "step": 275 + }, + { + "epoch": 0.05, + "grad_norm": 26.17958111536244, + "learning_rate": 1.5864022662889519e-06, + "loss": 2.3432, + "step": 280 + }, + { + "epoch": 0.05, + "grad_norm": 22.95410575681228, + "learning_rate": 1.614730878186969e-06, + "loss": 2.3145, + "step": 285 + }, + { + "epoch": 0.05, + "grad_norm": 23.93998688099864, + "learning_rate": 1.6430594900849862e-06, + "loss": 2.26, + "step": 290 + }, + { + "epoch": 0.05, + "grad_norm": 21.32107695775869, + "learning_rate": 1.671388101983003e-06, + "loss": 2.2498, + "step": 295 + }, + { + "epoch": 0.05, + "grad_norm": 24.70050367023582, + "learning_rate": 1.69971671388102e-06, + "loss": 2.2334, + "step": 300 + }, + { + "epoch": 0.05, + "grad_norm": 48.55117360385103, + "learning_rate": 1.728045325779037e-06, + "loss": 2.2476, + "step": 305 + }, + { + "epoch": 0.05, + "grad_norm": 27.055278892184113, + "learning_rate": 1.7563739376770538e-06, + "loss": 2.2039, + "step": 310 + }, + { + "epoch": 0.05, + "grad_norm": 19.84999881904028, + "learning_rate": 1.7847025495750709e-06, + "loss": 2.1813, + "step": 315 + }, + { + "epoch": 0.05, + "grad_norm": 40.40413661716398, + "learning_rate": 1.8130311614730881e-06, + "loss": 2.2304, + "step": 320 + }, + { + "epoch": 0.06, + "grad_norm": 32.46690888295986, + "learning_rate": 1.841359773371105e-06, + "loss": 2.2263, + "step": 325 + }, + { + "epoch": 0.06, + "grad_norm": 20.51738634970082, + "learning_rate": 1.869688385269122e-06, + "loss": 2.1234, + "step": 330 + }, + { + "epoch": 0.06, + "grad_norm": 35.222336018483524, + "learning_rate": 1.8980169971671389e-06, + "loss": 2.1573, + "step": 335 + }, + { + "epoch": 0.06, + "grad_norm": 37.7368422922636, + "learning_rate": 1.9263456090651557e-06, + "loss": 2.1452, + "step": 340 + }, + { + "epoch": 0.06, + "grad_norm": 21.56463831517406, + "learning_rate": 1.954674220963173e-06, + "loss": 2.128, + "step": 345 + }, + { + "epoch": 0.06, + "grad_norm": 23.972624335060562, + "learning_rate": 1.98300283286119e-06, + "loss": 2.1398, + "step": 350 + }, + { + "epoch": 0.06, + "grad_norm": 20.213325978350014, + "learning_rate": 2.011331444759207e-06, + "loss": 2.0917, + "step": 355 + }, + { + "epoch": 0.06, + "grad_norm": 32.978422655923445, + "learning_rate": 2.039660056657224e-06, + "loss": 2.0906, + "step": 360 + }, + { + "epoch": 0.06, + "grad_norm": 24.072172345830047, + "learning_rate": 2.067988668555241e-06, + "loss": 2.073, + "step": 365 + }, + { + "epoch": 0.06, + "grad_norm": 18.862216422542378, + "learning_rate": 2.096317280453258e-06, + "loss": 2.0942, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 22.782579148305224, + "learning_rate": 2.124645892351275e-06, + "loss": 2.0978, + "step": 375 + }, + { + "epoch": 0.06, + "grad_norm": 27.64608563553747, + "learning_rate": 2.1529745042492918e-06, + "loss": 2.0611, + "step": 380 + }, + { + "epoch": 0.07, + "grad_norm": 25.729919004188048, + "learning_rate": 2.181303116147309e-06, + "loss": 2.047, + "step": 385 + }, + { + "epoch": 0.07, + "grad_norm": 19.56297139440245, + "learning_rate": 2.209631728045326e-06, + "loss": 2.0339, + "step": 390 + }, + { + "epoch": 0.07, + "grad_norm": 19.52987241347125, + "learning_rate": 2.237960339943343e-06, + "loss": 2.0449, + "step": 395 + }, + { + "epoch": 0.07, + "grad_norm": 21.353166316715352, + "learning_rate": 2.26628895184136e-06, + "loss": 2.0223, + "step": 400 + }, + { + "epoch": 0.07, + "grad_norm": 18.933589191705828, + "learning_rate": 2.294617563739377e-06, + "loss": 2.0439, + "step": 405 + }, + { + "epoch": 0.07, + "grad_norm": 21.840396501445202, + "learning_rate": 2.3229461756373937e-06, + "loss": 2.0128, + "step": 410 + }, + { + "epoch": 0.07, + "grad_norm": 38.57601116485757, + "learning_rate": 2.3512747875354108e-06, + "loss": 2.0437, + "step": 415 + }, + { + "epoch": 0.07, + "grad_norm": 41.85780844791552, + "learning_rate": 2.3796033994334282e-06, + "loss": 1.9994, + "step": 420 + }, + { + "epoch": 0.07, + "grad_norm": 35.48875979085677, + "learning_rate": 2.407932011331445e-06, + "loss": 2.0033, + "step": 425 + }, + { + "epoch": 0.07, + "grad_norm": 43.38637323829772, + "learning_rate": 2.436260623229462e-06, + "loss": 2.005, + "step": 430 + }, + { + "epoch": 0.07, + "grad_norm": 19.900163750049977, + "learning_rate": 2.464589235127479e-06, + "loss": 1.9753, + "step": 435 + }, + { + "epoch": 0.07, + "grad_norm": 22.204455228895014, + "learning_rate": 2.4929178470254956e-06, + "loss": 1.9557, + "step": 440 + }, + { + "epoch": 0.08, + "grad_norm": 17.74787616134596, + "learning_rate": 2.521246458923513e-06, + "loss": 1.9367, + "step": 445 + }, + { + "epoch": 0.08, + "grad_norm": 21.03296681245724, + "learning_rate": 2.54957507082153e-06, + "loss": 1.9577, + "step": 450 + }, + { + "epoch": 0.08, + "grad_norm": 23.070555798078754, + "learning_rate": 2.577903682719547e-06, + "loss": 1.9425, + "step": 455 + }, + { + "epoch": 0.08, + "grad_norm": 18.095413622860136, + "learning_rate": 2.606232294617564e-06, + "loss": 1.9424, + "step": 460 + }, + { + "epoch": 0.08, + "grad_norm": 37.1892575296801, + "learning_rate": 2.634560906515581e-06, + "loss": 1.941, + "step": 465 + }, + { + "epoch": 0.08, + "grad_norm": 30.687534445988742, + "learning_rate": 2.662889518413598e-06, + "loss": 1.9047, + "step": 470 + }, + { + "epoch": 0.08, + "grad_norm": 22.424833044102115, + "learning_rate": 2.6912181303116146e-06, + "loss": 1.9292, + "step": 475 + }, + { + "epoch": 0.08, + "grad_norm": 27.360616215950174, + "learning_rate": 2.719546742209632e-06, + "loss": 1.9009, + "step": 480 + }, + { + "epoch": 0.08, + "grad_norm": 40.15039463581183, + "learning_rate": 2.747875354107649e-06, + "loss": 1.9329, + "step": 485 + }, + { + "epoch": 0.08, + "grad_norm": 22.5561162952693, + "learning_rate": 2.776203966005666e-06, + "loss": 1.8924, + "step": 490 + }, + { + "epoch": 0.08, + "grad_norm": 20.21398000813242, + "learning_rate": 2.804532577903683e-06, + "loss": 1.8809, + "step": 495 + }, + { + "epoch": 0.09, + "grad_norm": 32.098925635303246, + "learning_rate": 2.8328611898017e-06, + "loss": 1.9138, + "step": 500 + }, + { + "epoch": 0.09, + "grad_norm": 42.05244685784008, + "learning_rate": 2.861189801699717e-06, + "loss": 1.8856, + "step": 505 + }, + { + "epoch": 0.09, + "grad_norm": 25.005208485231954, + "learning_rate": 2.889518413597734e-06, + "loss": 1.841, + "step": 510 + }, + { + "epoch": 0.09, + "grad_norm": 28.615268279304342, + "learning_rate": 2.9178470254957506e-06, + "loss": 1.8342, + "step": 515 + }, + { + "epoch": 0.09, + "grad_norm": 36.647396681534886, + "learning_rate": 2.9461756373937677e-06, + "loss": 1.8696, + "step": 520 + }, + { + "epoch": 0.09, + "grad_norm": 52.74806377562607, + "learning_rate": 2.9745042492917848e-06, + "loss": 1.8655, + "step": 525 + }, + { + "epoch": 0.09, + "grad_norm": 35.82795457958816, + "learning_rate": 3.0028328611898022e-06, + "loss": 1.8683, + "step": 530 + }, + { + "epoch": 0.09, + "grad_norm": 29.29875595295583, + "learning_rate": 3.0311614730878193e-06, + "loss": 1.8225, + "step": 535 + }, + { + "epoch": 0.09, + "grad_norm": 31.058050277581376, + "learning_rate": 3.059490084985836e-06, + "loss": 1.8488, + "step": 540 + }, + { + "epoch": 0.09, + "grad_norm": 23.73823300434112, + "learning_rate": 3.087818696883853e-06, + "loss": 1.8479, + "step": 545 + }, + { + "epoch": 0.09, + "grad_norm": 19.57654252274149, + "learning_rate": 3.11614730878187e-06, + "loss": 1.8151, + "step": 550 + }, + { + "epoch": 0.09, + "grad_norm": 17.0532264932674, + "learning_rate": 3.144475920679887e-06, + "loss": 1.8559, + "step": 555 + }, + { + "epoch": 0.1, + "grad_norm": 17.48136081087222, + "learning_rate": 3.1728045325779038e-06, + "loss": 1.7867, + "step": 560 + }, + { + "epoch": 0.1, + "grad_norm": 20.273223063451194, + "learning_rate": 3.201133144475921e-06, + "loss": 1.8184, + "step": 565 + }, + { + "epoch": 0.1, + "grad_norm": 22.305551298403266, + "learning_rate": 3.229461756373938e-06, + "loss": 1.792, + "step": 570 + }, + { + "epoch": 0.1, + "grad_norm": 51.38398971351818, + "learning_rate": 3.257790368271955e-06, + "loss": 1.8154, + "step": 575 + }, + { + "epoch": 0.1, + "grad_norm": 49.786223800122116, + "learning_rate": 3.2861189801699724e-06, + "loss": 1.8044, + "step": 580 + }, + { + "epoch": 0.1, + "grad_norm": 47.56380663604067, + "learning_rate": 3.314447592067989e-06, + "loss": 1.8327, + "step": 585 + }, + { + "epoch": 0.1, + "grad_norm": 44.47524479659074, + "learning_rate": 3.342776203966006e-06, + "loss": 1.8105, + "step": 590 + }, + { + "epoch": 0.1, + "grad_norm": 33.604777311500975, + "learning_rate": 3.371104815864023e-06, + "loss": 1.7709, + "step": 595 + }, + { + "epoch": 0.1, + "grad_norm": 28.320348049863327, + "learning_rate": 3.39943342776204e-06, + "loss": 1.7832, + "step": 600 + }, + { + "epoch": 0.1, + "grad_norm": 27.15094992711375, + "learning_rate": 3.427762039660057e-06, + "loss": 1.7769, + "step": 605 + }, + { + "epoch": 0.1, + "grad_norm": 18.163403461295847, + "learning_rate": 3.456090651558074e-06, + "loss": 1.7995, + "step": 610 + }, + { + "epoch": 0.1, + "grad_norm": 27.076996397342533, + "learning_rate": 3.484419263456091e-06, + "loss": 1.7721, + "step": 615 + }, + { + "epoch": 0.11, + "grad_norm": 20.08583043568208, + "learning_rate": 3.5127478753541076e-06, + "loss": 1.7494, + "step": 620 + }, + { + "epoch": 0.11, + "grad_norm": 34.60488151573862, + "learning_rate": 3.5410764872521247e-06, + "loss": 1.7875, + "step": 625 + }, + { + "epoch": 0.11, + "grad_norm": 19.583304295440154, + "learning_rate": 3.5694050991501417e-06, + "loss": 1.7533, + "step": 630 + }, + { + "epoch": 0.11, + "grad_norm": 42.29524810809757, + "learning_rate": 3.597733711048159e-06, + "loss": 1.7644, + "step": 635 + }, + { + "epoch": 0.11, + "grad_norm": 66.58393026110944, + "learning_rate": 3.6260623229461763e-06, + "loss": 1.7385, + "step": 640 + }, + { + "epoch": 0.11, + "grad_norm": 31.076394544761435, + "learning_rate": 3.654390934844193e-06, + "loss": 1.742, + "step": 645 + }, + { + "epoch": 0.11, + "grad_norm": 18.91941602854681, + "learning_rate": 3.68271954674221e-06, + "loss": 1.7907, + "step": 650 + }, + { + "epoch": 0.11, + "grad_norm": 20.48663060720521, + "learning_rate": 3.711048158640227e-06, + "loss": 1.772, + "step": 655 + }, + { + "epoch": 0.11, + "grad_norm": 21.169491370728753, + "learning_rate": 3.739376770538244e-06, + "loss": 1.7613, + "step": 660 + }, + { + "epoch": 0.11, + "grad_norm": 19.394119690737643, + "learning_rate": 3.7677053824362607e-06, + "loss": 1.7357, + "step": 665 + }, + { + "epoch": 0.11, + "grad_norm": 36.97000360839211, + "learning_rate": 3.7960339943342778e-06, + "loss": 1.7568, + "step": 670 + }, + { + "epoch": 0.11, + "grad_norm": 41.170536011729254, + "learning_rate": 3.824362606232295e-06, + "loss": 1.7878, + "step": 675 + }, + { + "epoch": 0.12, + "grad_norm": 18.91271833985899, + "learning_rate": 3.8526912181303115e-06, + "loss": 1.735, + "step": 680 + }, + { + "epoch": 0.12, + "grad_norm": 15.177589104249861, + "learning_rate": 3.881019830028329e-06, + "loss": 1.7479, + "step": 685 + }, + { + "epoch": 0.12, + "grad_norm": 53.90923998912517, + "learning_rate": 3.909348441926346e-06, + "loss": 1.7264, + "step": 690 + }, + { + "epoch": 0.12, + "grad_norm": 79.43302487475911, + "learning_rate": 3.937677053824363e-06, + "loss": 1.6986, + "step": 695 + }, + { + "epoch": 0.12, + "grad_norm": 20.1848106409289, + "learning_rate": 3.96600566572238e-06, + "loss": 1.7403, + "step": 700 + }, + { + "epoch": 0.12, + "grad_norm": 18.977411772592603, + "learning_rate": 3.994334277620397e-06, + "loss": 1.729, + "step": 705 + }, + { + "epoch": 0.12, + "grad_norm": 19.60858292804487, + "learning_rate": 4.022662889518414e-06, + "loss": 1.7391, + "step": 710 + }, + { + "epoch": 0.12, + "grad_norm": 25.667932494840965, + "learning_rate": 4.0509915014164304e-06, + "loss": 1.7108, + "step": 715 + }, + { + "epoch": 0.12, + "grad_norm": 20.78540678599752, + "learning_rate": 4.079320113314448e-06, + "loss": 1.7221, + "step": 720 + }, + { + "epoch": 0.12, + "grad_norm": 76.42698172331296, + "learning_rate": 4.1076487252124646e-06, + "loss": 1.7105, + "step": 725 + }, + { + "epoch": 0.12, + "grad_norm": 71.16243308160482, + "learning_rate": 4.135977337110482e-06, + "loss": 1.727, + "step": 730 + }, + { + "epoch": 0.12, + "grad_norm": 17.40654783870013, + "learning_rate": 4.1643059490084995e-06, + "loss": 1.6936, + "step": 735 + }, + { + "epoch": 0.13, + "grad_norm": 34.693252022614544, + "learning_rate": 4.192634560906516e-06, + "loss": 1.6621, + "step": 740 + }, + { + "epoch": 0.13, + "grad_norm": 38.53652676282218, + "learning_rate": 4.220963172804533e-06, + "loss": 1.6904, + "step": 745 + }, + { + "epoch": 0.13, + "grad_norm": 26.83929663571352, + "learning_rate": 4.24929178470255e-06, + "loss": 1.6837, + "step": 750 + }, + { + "epoch": 0.13, + "grad_norm": 45.81507221553908, + "learning_rate": 4.277620396600567e-06, + "loss": 1.6989, + "step": 755 + }, + { + "epoch": 0.13, + "grad_norm": 33.692133005547205, + "learning_rate": 4.3059490084985835e-06, + "loss": 1.6386, + "step": 760 + }, + { + "epoch": 0.13, + "grad_norm": 19.38330427723925, + "learning_rate": 4.334277620396601e-06, + "loss": 1.6653, + "step": 765 + }, + { + "epoch": 0.13, + "grad_norm": 30.678965153137693, + "learning_rate": 4.362606232294618e-06, + "loss": 1.6807, + "step": 770 + }, + { + "epoch": 0.13, + "grad_norm": 24.165456765295332, + "learning_rate": 4.390934844192635e-06, + "loss": 1.6987, + "step": 775 + }, + { + "epoch": 0.13, + "grad_norm": 32.29327910090702, + "learning_rate": 4.419263456090652e-06, + "loss": 1.6802, + "step": 780 + }, + { + "epoch": 0.13, + "grad_norm": 21.16971480892796, + "learning_rate": 4.447592067988669e-06, + "loss": 1.6497, + "step": 785 + }, + { + "epoch": 0.13, + "grad_norm": 26.707784735238036, + "learning_rate": 4.475920679886686e-06, + "loss": 1.6604, + "step": 790 + }, + { + "epoch": 0.14, + "grad_norm": 50.87756098034557, + "learning_rate": 4.504249291784703e-06, + "loss": 1.6326, + "step": 795 + }, + { + "epoch": 0.14, + "grad_norm": 52.23774283052041, + "learning_rate": 4.53257790368272e-06, + "loss": 1.6485, + "step": 800 + }, + { + "epoch": 0.14, + "grad_norm": 63.05813173390926, + "learning_rate": 4.560906515580737e-06, + "loss": 1.6834, + "step": 805 + }, + { + "epoch": 0.14, + "grad_norm": 22.075712655510806, + "learning_rate": 4.589235127478754e-06, + "loss": 1.6382, + "step": 810 + }, + { + "epoch": 0.14, + "grad_norm": 29.09471283050641, + "learning_rate": 4.617563739376771e-06, + "loss": 1.6572, + "step": 815 + }, + { + "epoch": 0.14, + "grad_norm": 43.606834380034115, + "learning_rate": 4.645892351274787e-06, + "loss": 1.6566, + "step": 820 + }, + { + "epoch": 0.14, + "grad_norm": 49.418261757308244, + "learning_rate": 4.674220963172805e-06, + "loss": 1.6501, + "step": 825 + }, + { + "epoch": 0.14, + "grad_norm": 60.58000269353636, + "learning_rate": 4.7025495750708215e-06, + "loss": 1.6284, + "step": 830 + }, + { + "epoch": 0.14, + "grad_norm": 19.768535580269724, + "learning_rate": 4.730878186968839e-06, + "loss": 1.6342, + "step": 835 + }, + { + "epoch": 0.14, + "grad_norm": 25.27132812623081, + "learning_rate": 4.7592067988668565e-06, + "loss": 1.6236, + "step": 840 + }, + { + "epoch": 0.14, + "grad_norm": 17.37282219238344, + "learning_rate": 4.787535410764873e-06, + "loss": 1.6277, + "step": 845 + }, + { + "epoch": 0.14, + "grad_norm": 46.068808485877284, + "learning_rate": 4.81586402266289e-06, + "loss": 1.6378, + "step": 850 + }, + { + "epoch": 0.15, + "grad_norm": 64.72578782324423, + "learning_rate": 4.844192634560907e-06, + "loss": 1.6242, + "step": 855 + }, + { + "epoch": 0.15, + "grad_norm": 36.406125553208405, + "learning_rate": 4.872521246458924e-06, + "loss": 1.625, + "step": 860 + }, + { + "epoch": 0.15, + "grad_norm": 19.157180999051647, + "learning_rate": 4.9008498583569405e-06, + "loss": 1.6419, + "step": 865 + }, + { + "epoch": 0.15, + "grad_norm": 31.267783331002605, + "learning_rate": 4.929178470254958e-06, + "loss": 1.6156, + "step": 870 + }, + { + "epoch": 0.15, + "grad_norm": 30.51950308301667, + "learning_rate": 4.957507082152975e-06, + "loss": 1.6127, + "step": 875 + }, + { + "epoch": 0.15, + "grad_norm": 59.65679660528968, + "learning_rate": 4.985835694050991e-06, + "loss": 1.5823, + "step": 880 + }, + { + "epoch": 0.15, + "grad_norm": 33.09588742022726, + "learning_rate": 5.014164305949009e-06, + "loss": 1.6119, + "step": 885 + }, + { + "epoch": 0.15, + "grad_norm": 17.46276943511441, + "learning_rate": 5.042492917847026e-06, + "loss": 1.5669, + "step": 890 + }, + { + "epoch": 0.15, + "grad_norm": 27.95986512759189, + "learning_rate": 5.070821529745043e-06, + "loss": 1.6021, + "step": 895 + }, + { + "epoch": 0.15, + "grad_norm": 40.53798724973572, + "learning_rate": 5.09915014164306e-06, + "loss": 1.5952, + "step": 900 + }, + { + "epoch": 0.15, + "grad_norm": 98.13247181294633, + "learning_rate": 5.127478753541076e-06, + "loss": 1.6185, + "step": 905 + }, + { + "epoch": 0.15, + "grad_norm": 52.03314426839591, + "learning_rate": 5.155807365439094e-06, + "loss": 1.6101, + "step": 910 + }, + { + "epoch": 0.16, + "grad_norm": 64.46486065152293, + "learning_rate": 5.184135977337111e-06, + "loss": 1.5932, + "step": 915 + }, + { + "epoch": 0.16, + "grad_norm": 55.85131329674641, + "learning_rate": 5.212464589235128e-06, + "loss": 1.5597, + "step": 920 + }, + { + "epoch": 0.16, + "grad_norm": 104.94386787582185, + "learning_rate": 5.240793201133145e-06, + "loss": 1.6062, + "step": 925 + }, + { + "epoch": 0.16, + "grad_norm": 55.90312620104168, + "learning_rate": 5.269121813031162e-06, + "loss": 1.6001, + "step": 930 + }, + { + "epoch": 0.16, + "grad_norm": 36.37767655193305, + "learning_rate": 5.297450424929179e-06, + "loss": 1.5993, + "step": 935 + }, + { + "epoch": 0.16, + "grad_norm": 39.26128766918641, + "learning_rate": 5.325779036827196e-06, + "loss": 1.5651, + "step": 940 + }, + { + "epoch": 0.16, + "grad_norm": 43.57675317461804, + "learning_rate": 5.3541076487252134e-06, + "loss": 1.5913, + "step": 945 + }, + { + "epoch": 0.16, + "grad_norm": 17.26426628907133, + "learning_rate": 5.382436260623229e-06, + "loss": 1.594, + "step": 950 + }, + { + "epoch": 0.16, + "grad_norm": 18.037423782357767, + "learning_rate": 5.410764872521247e-06, + "loss": 1.5744, + "step": 955 + }, + { + "epoch": 0.16, + "grad_norm": 130.6445045404753, + "learning_rate": 5.439093484419264e-06, + "loss": 1.6486, + "step": 960 + }, + { + "epoch": 0.16, + "grad_norm": 54.45429682389578, + "learning_rate": 5.467422096317281e-06, + "loss": 1.5734, + "step": 965 + }, + { + "epoch": 0.16, + "grad_norm": 42.69622349959172, + "learning_rate": 5.495750708215298e-06, + "loss": 1.6064, + "step": 970 + }, + { + "epoch": 0.17, + "grad_norm": 59.71378250423668, + "learning_rate": 5.524079320113315e-06, + "loss": 1.6193, + "step": 975 + }, + { + "epoch": 0.17, + "grad_norm": 62.55182445293084, + "learning_rate": 5.552407932011332e-06, + "loss": 1.5776, + "step": 980 + }, + { + "epoch": 0.17, + "grad_norm": 59.05878711283353, + "learning_rate": 5.580736543909348e-06, + "loss": 1.5794, + "step": 985 + }, + { + "epoch": 0.17, + "grad_norm": 28.560889477217298, + "learning_rate": 5.609065155807366e-06, + "loss": 1.5593, + "step": 990 + }, + { + "epoch": 0.17, + "grad_norm": 27.490073642677785, + "learning_rate": 5.637393767705382e-06, + "loss": 1.5749, + "step": 995 + }, + { + "epoch": 0.17, + "grad_norm": 15.950065631003085, + "learning_rate": 5.6657223796034e-06, + "loss": 1.5537, + "step": 1000 + }, + { + "epoch": 0.17, + "grad_norm": 43.35553144079011, + "learning_rate": 5.6940509915014164e-06, + "loss": 1.5762, + "step": 1005 + }, + { + "epoch": 0.17, + "grad_norm": 28.678343487108393, + "learning_rate": 5.722379603399434e-06, + "loss": 1.5324, + "step": 1010 + }, + { + "epoch": 0.17, + "grad_norm": 98.86163137432244, + "learning_rate": 5.750708215297451e-06, + "loss": 1.5333, + "step": 1015 + }, + { + "epoch": 0.17, + "grad_norm": 63.299358135948566, + "learning_rate": 5.779036827195468e-06, + "loss": 1.542, + "step": 1020 + }, + { + "epoch": 0.17, + "grad_norm": 19.56384405996424, + "learning_rate": 5.8073654390934855e-06, + "loss": 1.5661, + "step": 1025 + }, + { + "epoch": 0.18, + "grad_norm": 27.414836270647065, + "learning_rate": 5.835694050991501e-06, + "loss": 1.564, + "step": 1030 + }, + { + "epoch": 0.18, + "grad_norm": 20.974654931798312, + "learning_rate": 5.864022662889519e-06, + "loss": 1.5267, + "step": 1035 + }, + { + "epoch": 0.18, + "grad_norm": 15.14250216808397, + "learning_rate": 5.892351274787535e-06, + "loss": 1.5485, + "step": 1040 + }, + { + "epoch": 0.18, + "grad_norm": 19.483781002585005, + "learning_rate": 5.920679886685553e-06, + "loss": 1.5312, + "step": 1045 + }, + { + "epoch": 0.18, + "grad_norm": 19.977930403123093, + "learning_rate": 5.9490084985835695e-06, + "loss": 1.5574, + "step": 1050 + }, + { + "epoch": 0.18, + "grad_norm": 35.868563902903155, + "learning_rate": 5.977337110481587e-06, + "loss": 1.5089, + "step": 1055 + }, + { + "epoch": 0.18, + "grad_norm": 35.70653189378658, + "learning_rate": 6.0056657223796045e-06, + "loss": 1.4986, + "step": 1060 + }, + { + "epoch": 0.18, + "grad_norm": 83.85460986190164, + "learning_rate": 6.033994334277621e-06, + "loss": 1.5424, + "step": 1065 + }, + { + "epoch": 0.18, + "grad_norm": 52.01364066249039, + "learning_rate": 6.062322946175639e-06, + "loss": 1.5451, + "step": 1070 + }, + { + "epoch": 0.18, + "grad_norm": 41.4992437255658, + "learning_rate": 6.090651558073654e-06, + "loss": 1.5275, + "step": 1075 + }, + { + "epoch": 0.18, + "grad_norm": 30.925485991933222, + "learning_rate": 6.118980169971672e-06, + "loss": 1.5082, + "step": 1080 + }, + { + "epoch": 0.18, + "grad_norm": 44.720242480528114, + "learning_rate": 6.1473087818696885e-06, + "loss": 1.5481, + "step": 1085 + }, + { + "epoch": 0.19, + "grad_norm": 53.91170639021118, + "learning_rate": 6.175637393767706e-06, + "loss": 1.5137, + "step": 1090 + }, + { + "epoch": 0.19, + "grad_norm": 78.19398697511646, + "learning_rate": 6.203966005665723e-06, + "loss": 1.5195, + "step": 1095 + }, + { + "epoch": 0.19, + "grad_norm": 39.94674555010526, + "learning_rate": 6.23229461756374e-06, + "loss": 1.5273, + "step": 1100 + }, + { + "epoch": 0.19, + "grad_norm": 17.855101814171903, + "learning_rate": 6.260623229461757e-06, + "loss": 1.5309, + "step": 1105 + }, + { + "epoch": 0.19, + "grad_norm": 44.867726205072024, + "learning_rate": 6.288951841359774e-06, + "loss": 1.5242, + "step": 1110 + }, + { + "epoch": 0.19, + "grad_norm": 29.131859743656516, + "learning_rate": 6.317280453257792e-06, + "loss": 1.5273, + "step": 1115 + }, + { + "epoch": 0.19, + "grad_norm": 17.611183595956476, + "learning_rate": 6.3456090651558075e-06, + "loss": 1.5011, + "step": 1120 + }, + { + "epoch": 0.19, + "grad_norm": 53.35713779003586, + "learning_rate": 6.373937677053825e-06, + "loss": 1.4962, + "step": 1125 + }, + { + "epoch": 0.19, + "grad_norm": 44.931444706391076, + "learning_rate": 6.402266288951842e-06, + "loss": 1.511, + "step": 1130 + }, + { + "epoch": 0.19, + "grad_norm": 22.44919456560898, + "learning_rate": 6.430594900849859e-06, + "loss": 1.5138, + "step": 1135 + }, + { + "epoch": 0.19, + "grad_norm": 45.08279033177284, + "learning_rate": 6.458923512747876e-06, + "loss": 1.4963, + "step": 1140 + }, + { + "epoch": 0.19, + "grad_norm": 17.52813913674402, + "learning_rate": 6.487252124645893e-06, + "loss": 1.5131, + "step": 1145 + }, + { + "epoch": 0.2, + "grad_norm": 26.26804383614154, + "learning_rate": 6.51558073654391e-06, + "loss": 1.5118, + "step": 1150 + }, + { + "epoch": 0.2, + "grad_norm": 38.38984295804615, + "learning_rate": 6.543909348441927e-06, + "loss": 1.5052, + "step": 1155 + }, + { + "epoch": 0.2, + "grad_norm": 62.89163803243198, + "learning_rate": 6.572237960339945e-06, + "loss": 1.4908, + "step": 1160 + }, + { + "epoch": 0.2, + "grad_norm": 16.43464761001089, + "learning_rate": 6.600566572237961e-06, + "loss": 1.4863, + "step": 1165 + }, + { + "epoch": 0.2, + "grad_norm": 27.171308819606647, + "learning_rate": 6.628895184135978e-06, + "loss": 1.4866, + "step": 1170 + }, + { + "epoch": 0.2, + "grad_norm": 12.858618384000161, + "learning_rate": 6.657223796033995e-06, + "loss": 1.4997, + "step": 1175 + }, + { + "epoch": 0.2, + "grad_norm": 26.73384894098579, + "learning_rate": 6.685552407932012e-06, + "loss": 1.4724, + "step": 1180 + }, + { + "epoch": 0.2, + "grad_norm": 21.318889353798717, + "learning_rate": 6.713881019830029e-06, + "loss": 1.4655, + "step": 1185 + }, + { + "epoch": 0.2, + "grad_norm": 47.36154668479138, + "learning_rate": 6.742209631728046e-06, + "loss": 1.4677, + "step": 1190 + }, + { + "epoch": 0.2, + "grad_norm": 13.312162357247926, + "learning_rate": 6.770538243626062e-06, + "loss": 1.487, + "step": 1195 + }, + { + "epoch": 0.2, + "grad_norm": 28.90405260718444, + "learning_rate": 6.79886685552408e-06, + "loss": 1.4774, + "step": 1200 + }, + { + "epoch": 0.2, + "grad_norm": 43.20968543413434, + "learning_rate": 6.827195467422096e-06, + "loss": 1.4592, + "step": 1205 + }, + { + "epoch": 0.21, + "grad_norm": 15.41534301679069, + "learning_rate": 6.855524079320114e-06, + "loss": 1.4741, + "step": 1210 + }, + { + "epoch": 0.21, + "grad_norm": 39.00940865689802, + "learning_rate": 6.883852691218131e-06, + "loss": 1.476, + "step": 1215 + }, + { + "epoch": 0.21, + "grad_norm": 15.463235319018573, + "learning_rate": 6.912181303116148e-06, + "loss": 1.4989, + "step": 1220 + }, + { + "epoch": 0.21, + "grad_norm": 16.14813609317964, + "learning_rate": 6.940509915014165e-06, + "loss": 1.4772, + "step": 1225 + }, + { + "epoch": 0.21, + "grad_norm": 45.64032082296392, + "learning_rate": 6.968838526912182e-06, + "loss": 1.4859, + "step": 1230 + }, + { + "epoch": 0.21, + "grad_norm": 14.426783265938841, + "learning_rate": 6.997167138810199e-06, + "loss": 1.4732, + "step": 1235 + }, + { + "epoch": 0.21, + "grad_norm": 56.98259656288749, + "learning_rate": 7.025495750708215e-06, + "loss": 1.4717, + "step": 1240 + }, + { + "epoch": 0.21, + "grad_norm": 73.22843429393423, + "learning_rate": 7.053824362606233e-06, + "loss": 1.4471, + "step": 1245 + }, + { + "epoch": 0.21, + "grad_norm": 25.573775068067942, + "learning_rate": 7.082152974504249e-06, + "loss": 1.4372, + "step": 1250 + }, + { + "epoch": 0.21, + "grad_norm": 12.443033292868687, + "learning_rate": 7.110481586402267e-06, + "loss": 1.4631, + "step": 1255 + }, + { + "epoch": 0.21, + "grad_norm": 14.668450041884629, + "learning_rate": 7.1388101983002834e-06, + "loss": 1.4368, + "step": 1260 + }, + { + "epoch": 0.22, + "grad_norm": 115.98207102359974, + "learning_rate": 7.167138810198301e-06, + "loss": 1.468, + "step": 1265 + }, + { + "epoch": 0.22, + "grad_norm": 128.82110344511952, + "learning_rate": 7.195467422096318e-06, + "loss": 1.4917, + "step": 1270 + }, + { + "epoch": 0.22, + "grad_norm": 31.41157580165412, + "learning_rate": 7.223796033994335e-06, + "loss": 1.4766, + "step": 1275 + }, + { + "epoch": 0.22, + "grad_norm": 33.117081868197246, + "learning_rate": 7.2521246458923525e-06, + "loss": 1.4506, + "step": 1280 + }, + { + "epoch": 0.22, + "grad_norm": 33.6257926561511, + "learning_rate": 7.280453257790368e-06, + "loss": 1.4871, + "step": 1285 + }, + { + "epoch": 0.22, + "grad_norm": 16.68056867252948, + "learning_rate": 7.308781869688386e-06, + "loss": 1.4495, + "step": 1290 + }, + { + "epoch": 0.22, + "grad_norm": 76.32432258998954, + "learning_rate": 7.337110481586402e-06, + "loss": 1.4476, + "step": 1295 + }, + { + "epoch": 0.22, + "grad_norm": 73.51943278047416, + "learning_rate": 7.36543909348442e-06, + "loss": 1.4601, + "step": 1300 + }, + { + "epoch": 0.22, + "grad_norm": 35.92091908157174, + "learning_rate": 7.3937677053824365e-06, + "loss": 1.4612, + "step": 1305 + }, + { + "epoch": 0.22, + "grad_norm": 37.15133392906697, + "learning_rate": 7.422096317280454e-06, + "loss": 1.4608, + "step": 1310 + }, + { + "epoch": 0.22, + "grad_norm": 54.24887555884184, + "learning_rate": 7.4504249291784715e-06, + "loss": 1.4471, + "step": 1315 + }, + { + "epoch": 0.22, + "grad_norm": 66.61085194888463, + "learning_rate": 7.478753541076488e-06, + "loss": 1.4306, + "step": 1320 + }, + { + "epoch": 0.23, + "grad_norm": 67.55198055290484, + "learning_rate": 7.507082152974506e-06, + "loss": 1.4294, + "step": 1325 + }, + { + "epoch": 0.23, + "grad_norm": 60.83568936526317, + "learning_rate": 7.535410764872521e-06, + "loss": 1.4426, + "step": 1330 + }, + { + "epoch": 0.23, + "grad_norm": 64.35303855984785, + "learning_rate": 7.563739376770539e-06, + "loss": 1.4382, + "step": 1335 + }, + { + "epoch": 0.23, + "grad_norm": 56.55071526619856, + "learning_rate": 7.5920679886685555e-06, + "loss": 1.4224, + "step": 1340 + }, + { + "epoch": 0.23, + "grad_norm": 67.12284685088699, + "learning_rate": 7.620396600566573e-06, + "loss": 1.4321, + "step": 1345 + }, + { + "epoch": 0.23, + "grad_norm": 48.395017715798886, + "learning_rate": 7.64872521246459e-06, + "loss": 1.4346, + "step": 1350 + }, + { + "epoch": 0.23, + "grad_norm": 38.088435110493364, + "learning_rate": 7.677053824362606e-06, + "loss": 1.434, + "step": 1355 + }, + { + "epoch": 0.23, + "grad_norm": 15.108791280938446, + "learning_rate": 7.705382436260623e-06, + "loss": 1.4467, + "step": 1360 + }, + { + "epoch": 0.23, + "grad_norm": 46.213258435941164, + "learning_rate": 7.733711048158641e-06, + "loss": 1.4726, + "step": 1365 + }, + { + "epoch": 0.23, + "grad_norm": 31.36163401132065, + "learning_rate": 7.762039660056658e-06, + "loss": 1.417, + "step": 1370 + }, + { + "epoch": 0.23, + "grad_norm": 21.57521345294901, + "learning_rate": 7.790368271954675e-06, + "loss": 1.4167, + "step": 1375 + }, + { + "epoch": 0.23, + "grad_norm": 25.684711958358132, + "learning_rate": 7.818696883852693e-06, + "loss": 1.4279, + "step": 1380 + }, + { + "epoch": 0.24, + "grad_norm": 46.14025445942214, + "learning_rate": 7.847025495750708e-06, + "loss": 1.4367, + "step": 1385 + }, + { + "epoch": 0.24, + "grad_norm": 16.837822707093785, + "learning_rate": 7.875354107648726e-06, + "loss": 1.4414, + "step": 1390 + }, + { + "epoch": 0.24, + "grad_norm": 25.32494554542278, + "learning_rate": 7.903682719546743e-06, + "loss": 1.4225, + "step": 1395 + }, + { + "epoch": 0.24, + "grad_norm": 22.096692157328395, + "learning_rate": 7.93201133144476e-06, + "loss": 1.4244, + "step": 1400 + }, + { + "epoch": 0.24, + "grad_norm": 23.085791295961783, + "learning_rate": 7.960339943342776e-06, + "loss": 1.4269, + "step": 1405 + }, + { + "epoch": 0.24, + "grad_norm": 42.63437473811217, + "learning_rate": 7.988668555240794e-06, + "loss": 1.4216, + "step": 1410 + }, + { + "epoch": 0.24, + "grad_norm": 14.581831841044835, + "learning_rate": 8.016997167138811e-06, + "loss": 1.4241, + "step": 1415 + }, + { + "epoch": 0.24, + "grad_norm": 107.73424906023473, + "learning_rate": 8.045325779036828e-06, + "loss": 1.4305, + "step": 1420 + }, + { + "epoch": 0.24, + "grad_norm": 102.80669799826326, + "learning_rate": 8.073654390934846e-06, + "loss": 1.4035, + "step": 1425 + }, + { + "epoch": 0.24, + "grad_norm": 54.25200717068274, + "learning_rate": 8.101983002832861e-06, + "loss": 1.4146, + "step": 1430 + }, + { + "epoch": 0.24, + "grad_norm": 70.28692545400818, + "learning_rate": 8.13031161473088e-06, + "loss": 1.4386, + "step": 1435 + }, + { + "epoch": 0.24, + "grad_norm": 41.22226556175442, + "learning_rate": 8.158640226628896e-06, + "loss": 1.4142, + "step": 1440 + }, + { + "epoch": 0.25, + "grad_norm": 33.320500906922966, + "learning_rate": 8.186968838526912e-06, + "loss": 1.4063, + "step": 1445 + }, + { + "epoch": 0.25, + "grad_norm": 20.96946984665567, + "learning_rate": 8.215297450424929e-06, + "loss": 1.3873, + "step": 1450 + }, + { + "epoch": 0.25, + "grad_norm": 36.4133779105684, + "learning_rate": 8.243626062322947e-06, + "loss": 1.3998, + "step": 1455 + }, + { + "epoch": 0.25, + "grad_norm": 19.19446044964224, + "learning_rate": 8.271954674220964e-06, + "loss": 1.4123, + "step": 1460 + }, + { + "epoch": 0.25, + "grad_norm": 21.008839218394122, + "learning_rate": 8.30028328611898e-06, + "loss": 1.3916, + "step": 1465 + }, + { + "epoch": 0.25, + "grad_norm": 20.51211979323123, + "learning_rate": 8.328611898016999e-06, + "loss": 1.4045, + "step": 1470 + }, + { + "epoch": 0.25, + "grad_norm": 28.355806563412795, + "learning_rate": 8.356940509915014e-06, + "loss": 1.3989, + "step": 1475 + }, + { + "epoch": 0.25, + "grad_norm": 56.645997417423445, + "learning_rate": 8.385269121813032e-06, + "loss": 1.3892, + "step": 1480 + }, + { + "epoch": 0.25, + "grad_norm": 37.58733117483026, + "learning_rate": 8.413597733711049e-06, + "loss": 1.3983, + "step": 1485 + }, + { + "epoch": 0.25, + "grad_norm": 23.45675645372948, + "learning_rate": 8.441926345609066e-06, + "loss": 1.4338, + "step": 1490 + }, + { + "epoch": 0.25, + "grad_norm": 43.27659750878071, + "learning_rate": 8.470254957507082e-06, + "loss": 1.4018, + "step": 1495 + }, + { + "epoch": 0.26, + "grad_norm": 26.338070551642947, + "learning_rate": 8.4985835694051e-06, + "loss": 1.3832, + "step": 1500 + }, + { + "epoch": 0.26, + "grad_norm": 35.56121674532834, + "learning_rate": 8.526912181303117e-06, + "loss": 1.3637, + "step": 1505 + }, + { + "epoch": 0.26, + "grad_norm": 13.858156584818255, + "learning_rate": 8.555240793201134e-06, + "loss": 1.3929, + "step": 1510 + }, + { + "epoch": 0.26, + "grad_norm": 14.860684083485195, + "learning_rate": 8.583569405099152e-06, + "loss": 1.3933, + "step": 1515 + }, + { + "epoch": 0.26, + "grad_norm": 16.67764945655357, + "learning_rate": 8.611898016997167e-06, + "loss": 1.3912, + "step": 1520 + }, + { + "epoch": 0.26, + "grad_norm": 14.069808432652248, + "learning_rate": 8.640226628895185e-06, + "loss": 1.3796, + "step": 1525 + }, + { + "epoch": 0.26, + "grad_norm": 65.37764259474062, + "learning_rate": 8.668555240793202e-06, + "loss": 1.3854, + "step": 1530 + }, + { + "epoch": 0.26, + "grad_norm": 67.37196707064975, + "learning_rate": 8.696883852691219e-06, + "loss": 1.3932, + "step": 1535 + }, + { + "epoch": 0.26, + "grad_norm": 26.53654854865448, + "learning_rate": 8.725212464589235e-06, + "loss": 1.3971, + "step": 1540 + }, + { + "epoch": 0.26, + "grad_norm": 120.33111697552754, + "learning_rate": 8.753541076487254e-06, + "loss": 1.3925, + "step": 1545 + }, + { + "epoch": 0.26, + "grad_norm": 26.033106458020956, + "learning_rate": 8.78186968838527e-06, + "loss": 1.3924, + "step": 1550 + }, + { + "epoch": 0.26, + "grad_norm": 127.2750601403478, + "learning_rate": 8.810198300283287e-06, + "loss": 1.4086, + "step": 1555 + }, + { + "epoch": 0.27, + "grad_norm": 97.35619667630274, + "learning_rate": 8.838526912181304e-06, + "loss": 1.4252, + "step": 1560 + }, + { + "epoch": 0.27, + "grad_norm": 41.5132287656772, + "learning_rate": 8.86685552407932e-06, + "loss": 1.4048, + "step": 1565 + }, + { + "epoch": 0.27, + "grad_norm": 20.19314986884624, + "learning_rate": 8.895184135977339e-06, + "loss": 1.4018, + "step": 1570 + }, + { + "epoch": 0.27, + "grad_norm": 20.207399962119673, + "learning_rate": 8.923512747875355e-06, + "loss": 1.3928, + "step": 1575 + }, + { + "epoch": 0.27, + "grad_norm": 18.706236409485633, + "learning_rate": 8.951841359773372e-06, + "loss": 1.402, + "step": 1580 + }, + { + "epoch": 0.27, + "grad_norm": 18.15951416173616, + "learning_rate": 8.980169971671388e-06, + "loss": 1.3589, + "step": 1585 + }, + { + "epoch": 0.27, + "grad_norm": 19.959848444063436, + "learning_rate": 9.008498583569407e-06, + "loss": 1.3918, + "step": 1590 + }, + { + "epoch": 0.27, + "grad_norm": 18.03318135371038, + "learning_rate": 9.036827195467422e-06, + "loss": 1.3908, + "step": 1595 + }, + { + "epoch": 0.27, + "grad_norm": 59.051519885278694, + "learning_rate": 9.06515580736544e-06, + "loss": 1.3731, + "step": 1600 + }, + { + "epoch": 0.27, + "grad_norm": 52.467108769254416, + "learning_rate": 9.093484419263457e-06, + "loss": 1.3487, + "step": 1605 + }, + { + "epoch": 0.27, + "grad_norm": 59.018311552807326, + "learning_rate": 9.121813031161473e-06, + "loss": 1.377, + "step": 1610 + }, + { + "epoch": 0.27, + "grad_norm": 14.959473760297636, + "learning_rate": 9.150141643059492e-06, + "loss": 1.4018, + "step": 1615 + }, + { + "epoch": 0.28, + "grad_norm": 35.688486380413146, + "learning_rate": 9.178470254957508e-06, + "loss": 1.3702, + "step": 1620 + }, + { + "epoch": 0.28, + "grad_norm": 17.702780585065685, + "learning_rate": 9.206798866855525e-06, + "loss": 1.3887, + "step": 1625 + }, + { + "epoch": 0.28, + "grad_norm": 44.95672001007261, + "learning_rate": 9.235127478753542e-06, + "loss": 1.3852, + "step": 1630 + }, + { + "epoch": 0.28, + "grad_norm": 23.805509750982903, + "learning_rate": 9.26345609065156e-06, + "loss": 1.3646, + "step": 1635 + }, + { + "epoch": 0.28, + "grad_norm": 39.33476110738001, + "learning_rate": 9.291784702549575e-06, + "loss": 1.3764, + "step": 1640 + }, + { + "epoch": 0.28, + "grad_norm": 12.540226055124654, + "learning_rate": 9.320113314447593e-06, + "loss": 1.3633, + "step": 1645 + }, + { + "epoch": 0.28, + "grad_norm": 60.937887345190425, + "learning_rate": 9.34844192634561e-06, + "loss": 1.3941, + "step": 1650 + }, + { + "epoch": 0.28, + "grad_norm": 15.183842224404986, + "learning_rate": 9.376770538243626e-06, + "loss": 1.3632, + "step": 1655 + }, + { + "epoch": 0.28, + "grad_norm": 13.268626818286448, + "learning_rate": 9.405099150141643e-06, + "loss": 1.3632, + "step": 1660 + }, + { + "epoch": 0.28, + "grad_norm": 14.977488338411256, + "learning_rate": 9.433427762039661e-06, + "loss": 1.3437, + "step": 1665 + }, + { + "epoch": 0.28, + "grad_norm": 17.69425703983086, + "learning_rate": 9.461756373937678e-06, + "loss": 1.3407, + "step": 1670 + }, + { + "epoch": 0.28, + "grad_norm": 50.81269776992468, + "learning_rate": 9.490084985835695e-06, + "loss": 1.3649, + "step": 1675 + }, + { + "epoch": 0.29, + "grad_norm": 52.987834036106406, + "learning_rate": 9.518413597733713e-06, + "loss": 1.368, + "step": 1680 + }, + { + "epoch": 0.29, + "grad_norm": 53.09975091644179, + "learning_rate": 9.546742209631728e-06, + "loss": 1.3753, + "step": 1685 + }, + { + "epoch": 0.29, + "grad_norm": 65.42753190343696, + "learning_rate": 9.575070821529746e-06, + "loss": 1.3373, + "step": 1690 + }, + { + "epoch": 0.29, + "grad_norm": 12.50878001715189, + "learning_rate": 9.603399433427763e-06, + "loss": 1.3204, + "step": 1695 + }, + { + "epoch": 0.29, + "grad_norm": 51.46300322685196, + "learning_rate": 9.63172804532578e-06, + "loss": 1.3517, + "step": 1700 + }, + { + "epoch": 0.29, + "grad_norm": 50.08389878971014, + "learning_rate": 9.660056657223796e-06, + "loss": 1.3377, + "step": 1705 + }, + { + "epoch": 0.29, + "grad_norm": 98.7501586904867, + "learning_rate": 9.688385269121814e-06, + "loss": 1.358, + "step": 1710 + }, + { + "epoch": 0.29, + "grad_norm": 25.256204978294605, + "learning_rate": 9.716713881019831e-06, + "loss": 1.3391, + "step": 1715 + }, + { + "epoch": 0.29, + "grad_norm": 96.05571532903846, + "learning_rate": 9.745042492917848e-06, + "loss": 1.3615, + "step": 1720 + }, + { + "epoch": 0.29, + "grad_norm": 26.755333256803034, + "learning_rate": 9.773371104815866e-06, + "loss": 1.3662, + "step": 1725 + }, + { + "epoch": 0.29, + "grad_norm": 25.837535923418997, + "learning_rate": 9.801699716713881e-06, + "loss": 1.38, + "step": 1730 + }, + { + "epoch": 0.29, + "grad_norm": 24.738456192454084, + "learning_rate": 9.8300283286119e-06, + "loss": 1.333, + "step": 1735 + }, + { + "epoch": 0.3, + "grad_norm": 41.26197817069162, + "learning_rate": 9.858356940509916e-06, + "loss": 1.3491, + "step": 1740 + }, + { + "epoch": 0.3, + "grad_norm": 51.99630683152431, + "learning_rate": 9.886685552407933e-06, + "loss": 1.3429, + "step": 1745 + }, + { + "epoch": 0.3, + "grad_norm": 46.34281673405669, + "learning_rate": 9.91501416430595e-06, + "loss": 1.3646, + "step": 1750 + }, + { + "epoch": 0.3, + "grad_norm": 22.857745511222603, + "learning_rate": 9.943342776203968e-06, + "loss": 1.3297, + "step": 1755 + }, + { + "epoch": 0.3, + "grad_norm": 11.712863491665592, + "learning_rate": 9.971671388101982e-06, + "loss": 1.321, + "step": 1760 + }, + { + "epoch": 0.3, + "grad_norm": 11.7261894236906, + "learning_rate": 1e-05, + "loss": 1.3444, + "step": 1765 + }, + { + "epoch": 0.3, + "grad_norm": 32.60283769493224, + "learning_rate": 9.99999755418257e-06, + "loss": 1.3408, + "step": 1770 + }, + { + "epoch": 0.3, + "grad_norm": 17.203512310970503, + "learning_rate": 9.999990216732668e-06, + "loss": 1.3446, + "step": 1775 + }, + { + "epoch": 0.3, + "grad_norm": 17.883185421539476, + "learning_rate": 9.999977987657479e-06, + "loss": 1.3313, + "step": 1780 + }, + { + "epoch": 0.3, + "grad_norm": 19.323697826239748, + "learning_rate": 9.99996086696896e-06, + "loss": 1.3503, + "step": 1785 + }, + { + "epoch": 0.3, + "grad_norm": 27.72449057867512, + "learning_rate": 9.999938854683867e-06, + "loss": 1.3201, + "step": 1790 + }, + { + "epoch": 0.31, + "grad_norm": 35.22723944266936, + "learning_rate": 9.99991195082373e-06, + "loss": 1.3395, + "step": 1795 + }, + { + "epoch": 0.31, + "grad_norm": 36.374853638958115, + "learning_rate": 9.999880155414872e-06, + "loss": 1.34, + "step": 1800 + }, + { + "epoch": 0.31, + "grad_norm": 29.11298844229375, + "learning_rate": 9.9998434684884e-06, + "loss": 1.3075, + "step": 1805 + }, + { + "epoch": 0.31, + "grad_norm": 18.978651122935727, + "learning_rate": 9.999801890080203e-06, + "loss": 1.3248, + "step": 1810 + }, + { + "epoch": 0.31, + "grad_norm": 62.277032520069206, + "learning_rate": 9.999755420230964e-06, + "loss": 1.3174, + "step": 1815 + }, + { + "epoch": 0.31, + "grad_norm": 42.25047939113886, + "learning_rate": 9.999704058986139e-06, + "loss": 1.3196, + "step": 1820 + }, + { + "epoch": 0.31, + "grad_norm": 121.34773754544872, + "learning_rate": 9.99964780639598e-06, + "loss": 1.3449, + "step": 1825 + }, + { + "epoch": 0.31, + "grad_norm": 157.875932629027, + "learning_rate": 9.999586662515519e-06, + "loss": 1.3766, + "step": 1830 + }, + { + "epoch": 0.31, + "grad_norm": 67.21735658366185, + "learning_rate": 9.999520627404576e-06, + "loss": 1.3336, + "step": 1835 + }, + { + "epoch": 0.31, + "grad_norm": 40.32573656282714, + "learning_rate": 9.999449701127753e-06, + "loss": 1.3358, + "step": 1840 + }, + { + "epoch": 0.31, + "grad_norm": 16.578779780453953, + "learning_rate": 9.999373883754442e-06, + "loss": 1.3263, + "step": 1845 + }, + { + "epoch": 0.31, + "grad_norm": 45.13578422606087, + "learning_rate": 9.999293175358814e-06, + "loss": 1.3156, + "step": 1850 + }, + { + "epoch": 0.32, + "grad_norm": 25.271194035753734, + "learning_rate": 9.99920757601983e-06, + "loss": 1.3158, + "step": 1855 + }, + { + "epoch": 0.32, + "grad_norm": 39.94426166106633, + "learning_rate": 9.999117085821233e-06, + "loss": 1.323, + "step": 1860 + }, + { + "epoch": 0.32, + "grad_norm": 30.457883465258536, + "learning_rate": 9.999021704851555e-06, + "loss": 1.3516, + "step": 1865 + }, + { + "epoch": 0.32, + "grad_norm": 36.9726912711344, + "learning_rate": 9.998921433204106e-06, + "loss": 1.333, + "step": 1870 + }, + { + "epoch": 0.32, + "grad_norm": 23.030928072688916, + "learning_rate": 9.998816270976985e-06, + "loss": 1.2991, + "step": 1875 + }, + { + "epoch": 0.32, + "grad_norm": 47.126156393517775, + "learning_rate": 9.998706218273078e-06, + "loss": 1.2744, + "step": 1880 + }, + { + "epoch": 0.32, + "grad_norm": 32.59637028245757, + "learning_rate": 9.998591275200049e-06, + "loss": 1.3096, + "step": 1885 + }, + { + "epoch": 0.32, + "grad_norm": 23.94926147678152, + "learning_rate": 9.998471441870353e-06, + "loss": 1.2996, + "step": 1890 + }, + { + "epoch": 0.32, + "grad_norm": 21.030622870893925, + "learning_rate": 9.998346718401222e-06, + "loss": 1.3209, + "step": 1895 + }, + { + "epoch": 0.32, + "grad_norm": 17.715711453727344, + "learning_rate": 9.998217104914683e-06, + "loss": 1.3006, + "step": 1900 + }, + { + "epoch": 0.32, + "grad_norm": 13.08367409477122, + "learning_rate": 9.998082601537533e-06, + "loss": 1.3012, + "step": 1905 + }, + { + "epoch": 0.32, + "grad_norm": 13.183410345778563, + "learning_rate": 9.997943208401365e-06, + "loss": 1.3134, + "step": 1910 + }, + { + "epoch": 0.33, + "grad_norm": 13.086410225689482, + "learning_rate": 9.99779892564255e-06, + "loss": 1.2904, + "step": 1915 + }, + { + "epoch": 0.33, + "grad_norm": 37.16067338584074, + "learning_rate": 9.997649753402243e-06, + "loss": 1.3047, + "step": 1920 + }, + { + "epoch": 0.33, + "grad_norm": 40.474695223414116, + "learning_rate": 9.997495691826382e-06, + "loss": 1.2886, + "step": 1925 + }, + { + "epoch": 0.33, + "grad_norm": 53.203394356059604, + "learning_rate": 9.997336741065694e-06, + "loss": 1.2939, + "step": 1930 + }, + { + "epoch": 0.33, + "grad_norm": 43.19777784215109, + "learning_rate": 9.997172901275682e-06, + "loss": 1.2875, + "step": 1935 + }, + { + "epoch": 0.33, + "grad_norm": 65.5795185045484, + "learning_rate": 9.997004172616633e-06, + "loss": 1.3288, + "step": 1940 + }, + { + "epoch": 0.33, + "grad_norm": 50.496217939819, + "learning_rate": 9.996830555253622e-06, + "loss": 1.3044, + "step": 1945 + }, + { + "epoch": 0.33, + "grad_norm": 16.159352252412273, + "learning_rate": 9.9966520493565e-06, + "loss": 1.3244, + "step": 1950 + }, + { + "epoch": 0.33, + "grad_norm": 14.449411392080844, + "learning_rate": 9.99646865509991e-06, + "loss": 1.3067, + "step": 1955 + }, + { + "epoch": 0.33, + "grad_norm": 30.85859582159372, + "learning_rate": 9.996280372663266e-06, + "loss": 1.2854, + "step": 1960 + }, + { + "epoch": 0.33, + "grad_norm": 13.954831400149821, + "learning_rate": 9.996087202230773e-06, + "loss": 1.2711, + "step": 1965 + }, + { + "epoch": 0.33, + "grad_norm": 13.535102116165461, + "learning_rate": 9.995889143991412e-06, + "loss": 1.2758, + "step": 1970 + }, + { + "epoch": 0.34, + "grad_norm": 14.05835385978514, + "learning_rate": 9.995686198138951e-06, + "loss": 1.2838, + "step": 1975 + }, + { + "epoch": 0.34, + "grad_norm": 27.158362157021386, + "learning_rate": 9.995478364871937e-06, + "loss": 1.2848, + "step": 1980 + }, + { + "epoch": 0.34, + "grad_norm": 17.607677347887147, + "learning_rate": 9.995265644393698e-06, + "loss": 1.3077, + "step": 1985 + }, + { + "epoch": 0.34, + "grad_norm": 12.019777361051082, + "learning_rate": 9.995048036912345e-06, + "loss": 1.2911, + "step": 1990 + }, + { + "epoch": 0.34, + "grad_norm": 36.439429993916605, + "learning_rate": 9.99482554264077e-06, + "loss": 1.2909, + "step": 1995 + }, + { + "epoch": 0.34, + "grad_norm": 19.316454115512972, + "learning_rate": 9.994598161796643e-06, + "loss": 1.28, + "step": 2000 + }, + { + "epoch": 0.34, + "grad_norm": 14.451990819041521, + "learning_rate": 9.994365894602419e-06, + "loss": 1.2796, + "step": 2005 + }, + { + "epoch": 0.34, + "grad_norm": 17.039745793623432, + "learning_rate": 9.994128741285329e-06, + "loss": 1.2776, + "step": 2010 + }, + { + "epoch": 0.34, + "grad_norm": 34.91032967270379, + "learning_rate": 9.99388670207739e-06, + "loss": 1.2953, + "step": 2015 + }, + { + "epoch": 0.34, + "grad_norm": 30.505391467678226, + "learning_rate": 9.99363977721539e-06, + "loss": 1.2912, + "step": 2020 + }, + { + "epoch": 0.34, + "grad_norm": 70.60877015993087, + "learning_rate": 9.993387966940908e-06, + "loss": 1.3002, + "step": 2025 + }, + { + "epoch": 0.35, + "grad_norm": 42.076694141065936, + "learning_rate": 9.993131271500293e-06, + "loss": 1.2742, + "step": 2030 + }, + { + "epoch": 0.35, + "grad_norm": 40.78685017799231, + "learning_rate": 9.992869691144679e-06, + "loss": 1.2764, + "step": 2035 + }, + { + "epoch": 0.35, + "grad_norm": 33.41521506875986, + "learning_rate": 9.992603226129978e-06, + "loss": 1.2883, + "step": 2040 + }, + { + "epoch": 0.35, + "grad_norm": 14.665923005413482, + "learning_rate": 9.992331876716877e-06, + "loss": 1.269, + "step": 2045 + }, + { + "epoch": 0.35, + "grad_norm": 35.13745352099027, + "learning_rate": 9.992055643170844e-06, + "loss": 1.2919, + "step": 2050 + }, + { + "epoch": 0.35, + "grad_norm": 93.57018198555726, + "learning_rate": 9.99177452576213e-06, + "loss": 1.2477, + "step": 2055 + }, + { + "epoch": 0.35, + "grad_norm": 73.14188594561246, + "learning_rate": 9.991488524765755e-06, + "loss": 1.2727, + "step": 2060 + }, + { + "epoch": 0.35, + "grad_norm": 60.954485489252725, + "learning_rate": 9.991197640461527e-06, + "loss": 1.2804, + "step": 2065 + }, + { + "epoch": 0.35, + "grad_norm": 49.16627669625653, + "learning_rate": 9.99090187313402e-06, + "loss": 1.2764, + "step": 2070 + }, + { + "epoch": 0.35, + "grad_norm": 71.11546717627556, + "learning_rate": 9.990601223072596e-06, + "loss": 1.2999, + "step": 2075 + }, + { + "epoch": 0.35, + "grad_norm": 20.860674350454783, + "learning_rate": 9.990295690571386e-06, + "loss": 1.2745, + "step": 2080 + }, + { + "epoch": 0.35, + "grad_norm": 30.0270032458091, + "learning_rate": 9.989985275929302e-06, + "loss": 1.2672, + "step": 2085 + }, + { + "epoch": 0.36, + "grad_norm": 48.4403765685246, + "learning_rate": 9.98966997945003e-06, + "loss": 1.2713, + "step": 2090 + }, + { + "epoch": 0.36, + "grad_norm": 16.184895143802937, + "learning_rate": 9.989349801442034e-06, + "loss": 1.2447, + "step": 2095 + }, + { + "epoch": 0.36, + "grad_norm": 17.326803432440844, + "learning_rate": 9.989024742218554e-06, + "loss": 1.2718, + "step": 2100 + }, + { + "epoch": 0.36, + "grad_norm": 22.212522451101893, + "learning_rate": 9.9886948020976e-06, + "loss": 1.2692, + "step": 2105 + }, + { + "epoch": 0.36, + "grad_norm": 11.955909550882618, + "learning_rate": 9.988359981401967e-06, + "loss": 1.2632, + "step": 2110 + }, + { + "epoch": 0.36, + "grad_norm": 54.16029859047456, + "learning_rate": 9.988020280459214e-06, + "loss": 1.2474, + "step": 2115 + }, + { + "epoch": 0.36, + "grad_norm": 18.99762681137852, + "learning_rate": 9.987675699601682e-06, + "loss": 1.2878, + "step": 2120 + }, + { + "epoch": 0.36, + "grad_norm": 10.986794193093745, + "learning_rate": 9.987326239166484e-06, + "loss": 1.2581, + "step": 2125 + }, + { + "epoch": 0.36, + "grad_norm": 41.18160853788803, + "learning_rate": 9.986971899495505e-06, + "loss": 1.2505, + "step": 2130 + }, + { + "epoch": 0.36, + "grad_norm": 22.73713386704341, + "learning_rate": 9.986612680935409e-06, + "loss": 1.2369, + "step": 2135 + }, + { + "epoch": 0.36, + "grad_norm": 39.94586605054518, + "learning_rate": 9.986248583837624e-06, + "loss": 1.2506, + "step": 2140 + }, + { + "epoch": 0.36, + "grad_norm": 68.00261940817953, + "learning_rate": 9.985879608558359e-06, + "loss": 1.2386, + "step": 2145 + }, + { + "epoch": 0.37, + "grad_norm": 25.278427620899553, + "learning_rate": 9.98550575545859e-06, + "loss": 1.2315, + "step": 2150 + }, + { + "epoch": 0.37, + "grad_norm": 58.48992267362782, + "learning_rate": 9.985127024904072e-06, + "loss": 1.2684, + "step": 2155 + }, + { + "epoch": 0.37, + "grad_norm": 107.22278634371908, + "learning_rate": 9.984743417265323e-06, + "loss": 1.261, + "step": 2160 + }, + { + "epoch": 0.37, + "grad_norm": 56.43898177359786, + "learning_rate": 9.984354932917639e-06, + "loss": 1.2578, + "step": 2165 + }, + { + "epoch": 0.37, + "grad_norm": 30.02405317886174, + "learning_rate": 9.983961572241085e-06, + "loss": 1.221, + "step": 2170 + }, + { + "epoch": 0.37, + "grad_norm": 21.331779746602443, + "learning_rate": 9.983563335620494e-06, + "loss": 1.2405, + "step": 2175 + }, + { + "epoch": 0.37, + "grad_norm": 43.31147042485862, + "learning_rate": 9.983160223445476e-06, + "loss": 1.2366, + "step": 2180 + }, + { + "epoch": 0.37, + "grad_norm": 10.029017686861732, + "learning_rate": 9.982752236110401e-06, + "loss": 1.2432, + "step": 2185 + }, + { + "epoch": 0.37, + "grad_norm": 14.894536079070498, + "learning_rate": 9.982339374014416e-06, + "loss": 1.2213, + "step": 2190 + }, + { + "epoch": 0.37, + "grad_norm": 52.58513573855897, + "learning_rate": 9.981921637561438e-06, + "loss": 1.2446, + "step": 2195 + }, + { + "epoch": 0.37, + "grad_norm": 148.5603971500644, + "learning_rate": 9.981499027160147e-06, + "loss": 1.2624, + "step": 2200 + }, + { + "epoch": 0.37, + "grad_norm": 67.61072411459752, + "learning_rate": 9.981071543223992e-06, + "loss": 1.2677, + "step": 2205 + }, + { + "epoch": 0.38, + "grad_norm": 54.65073271042123, + "learning_rate": 9.980639186171198e-06, + "loss": 1.2462, + "step": 2210 + }, + { + "epoch": 0.38, + "grad_norm": 38.85335665696665, + "learning_rate": 9.980201956424748e-06, + "loss": 1.2708, + "step": 2215 + }, + { + "epoch": 0.38, + "grad_norm": 13.33642147794773, + "learning_rate": 9.979759854412395e-06, + "loss": 1.2643, + "step": 2220 + }, + { + "epoch": 0.38, + "grad_norm": 11.980103706134027, + "learning_rate": 9.97931288056666e-06, + "loss": 1.2406, + "step": 2225 + }, + { + "epoch": 0.38, + "grad_norm": 9.425628724046566, + "learning_rate": 9.978861035324832e-06, + "loss": 1.217, + "step": 2230 + }, + { + "epoch": 0.38, + "grad_norm": 46.170020521427176, + "learning_rate": 9.97840431912896e-06, + "loss": 1.24, + "step": 2235 + }, + { + "epoch": 0.38, + "grad_norm": 20.23431894994783, + "learning_rate": 9.977942732425862e-06, + "loss": 1.2447, + "step": 2240 + }, + { + "epoch": 0.38, + "grad_norm": 29.45427587924372, + "learning_rate": 9.977476275667123e-06, + "loss": 1.2285, + "step": 2245 + }, + { + "epoch": 0.38, + "grad_norm": 21.476899810487403, + "learning_rate": 9.97700494930909e-06, + "loss": 1.1955, + "step": 2250 + }, + { + "epoch": 0.38, + "grad_norm": 18.56756736605288, + "learning_rate": 9.976528753812874e-06, + "loss": 1.2215, + "step": 2255 + }, + { + "epoch": 0.38, + "grad_norm": 14.963469861084638, + "learning_rate": 9.976047689644345e-06, + "loss": 1.2168, + "step": 2260 + }, + { + "epoch": 0.39, + "grad_norm": 37.7498655059393, + "learning_rate": 9.97556175727415e-06, + "loss": 1.233, + "step": 2265 + }, + { + "epoch": 0.39, + "grad_norm": 11.34730041668875, + "learning_rate": 9.975070957177681e-06, + "loss": 1.2258, + "step": 2270 + }, + { + "epoch": 0.39, + "grad_norm": 29.797887323807636, + "learning_rate": 9.974575289835107e-06, + "loss": 1.2487, + "step": 2275 + }, + { + "epoch": 0.39, + "grad_norm": 10.355285431034126, + "learning_rate": 9.974074755731351e-06, + "loss": 1.2322, + "step": 2280 + }, + { + "epoch": 0.39, + "grad_norm": 11.54182670812362, + "learning_rate": 9.973569355356099e-06, + "loss": 1.2535, + "step": 2285 + }, + { + "epoch": 0.39, + "grad_norm": 21.753113355014868, + "learning_rate": 9.973059089203797e-06, + "loss": 1.2257, + "step": 2290 + }, + { + "epoch": 0.39, + "grad_norm": 13.018327238321696, + "learning_rate": 9.972543957773653e-06, + "loss": 1.2411, + "step": 2295 + }, + { + "epoch": 0.39, + "grad_norm": 11.70691445466828, + "learning_rate": 9.972023961569632e-06, + "loss": 1.226, + "step": 2300 + }, + { + "epoch": 0.39, + "grad_norm": 40.42682718423413, + "learning_rate": 9.971499101100463e-06, + "loss": 1.2181, + "step": 2305 + }, + { + "epoch": 0.39, + "grad_norm": 51.091493759604035, + "learning_rate": 9.97096937687963e-06, + "loss": 1.2247, + "step": 2310 + }, + { + "epoch": 0.39, + "grad_norm": 48.195350237791374, + "learning_rate": 9.970434789425378e-06, + "loss": 1.2125, + "step": 2315 + }, + { + "epoch": 0.39, + "grad_norm": 17.38647273683228, + "learning_rate": 9.969895339260706e-06, + "loss": 1.2069, + "step": 2320 + }, + { + "epoch": 0.4, + "grad_norm": 30.49139360025075, + "learning_rate": 9.969351026913375e-06, + "loss": 1.2194, + "step": 2325 + }, + { + "epoch": 0.4, + "grad_norm": 15.385511153165469, + "learning_rate": 9.968801852915897e-06, + "loss": 1.2209, + "step": 2330 + }, + { + "epoch": 0.4, + "grad_norm": 22.600235098601413, + "learning_rate": 9.968247817805548e-06, + "loss": 1.233, + "step": 2335 + }, + { + "epoch": 0.4, + "grad_norm": 25.51632451097952, + "learning_rate": 9.967688922124351e-06, + "loss": 1.2234, + "step": 2340 + }, + { + "epoch": 0.4, + "grad_norm": 12.458533234384973, + "learning_rate": 9.967125166419092e-06, + "loss": 1.2337, + "step": 2345 + }, + { + "epoch": 0.4, + "grad_norm": 15.137901496985272, + "learning_rate": 9.966556551241307e-06, + "loss": 1.2252, + "step": 2350 + }, + { + "epoch": 0.4, + "grad_norm": 13.554021202381733, + "learning_rate": 9.965983077147287e-06, + "loss": 1.232, + "step": 2355 + }, + { + "epoch": 0.4, + "grad_norm": 29.255477281363408, + "learning_rate": 9.965404744698081e-06, + "loss": 1.223, + "step": 2360 + }, + { + "epoch": 0.4, + "grad_norm": 55.71066517356006, + "learning_rate": 9.964821554459482e-06, + "loss": 1.211, + "step": 2365 + }, + { + "epoch": 0.4, + "grad_norm": 46.97293908156358, + "learning_rate": 9.964233507002044e-06, + "loss": 1.2306, + "step": 2370 + }, + { + "epoch": 0.4, + "grad_norm": 29.82807675197631, + "learning_rate": 9.963640602901069e-06, + "loss": 1.1983, + "step": 2375 + }, + { + "epoch": 0.4, + "grad_norm": 40.8106865600396, + "learning_rate": 9.96304284273661e-06, + "loss": 1.2362, + "step": 2380 + }, + { + "epoch": 0.41, + "grad_norm": 37.035382402104936, + "learning_rate": 9.962440227093474e-06, + "loss": 1.2233, + "step": 2385 + }, + { + "epoch": 0.41, + "grad_norm": 47.48410746333308, + "learning_rate": 9.961832756561213e-06, + "loss": 1.2354, + "step": 2390 + }, + { + "epoch": 0.41, + "grad_norm": 20.34844898900087, + "learning_rate": 9.961220431734137e-06, + "loss": 1.2215, + "step": 2395 + }, + { + "epoch": 0.41, + "grad_norm": 108.59661182177989, + "learning_rate": 9.960603253211295e-06, + "loss": 1.2327, + "step": 2400 + }, + { + "epoch": 0.41, + "grad_norm": 76.44086594519507, + "learning_rate": 9.95998122159649e-06, + "loss": 1.2229, + "step": 2405 + }, + { + "epoch": 0.41, + "grad_norm": 38.53725294367486, + "learning_rate": 9.959354337498274e-06, + "loss": 1.2166, + "step": 2410 + }, + { + "epoch": 0.41, + "grad_norm": 23.492069683488847, + "learning_rate": 9.958722601529945e-06, + "loss": 1.2349, + "step": 2415 + }, + { + "epoch": 0.41, + "grad_norm": 32.42759975420263, + "learning_rate": 9.958086014309545e-06, + "loss": 1.212, + "step": 2420 + }, + { + "epoch": 0.41, + "grad_norm": 19.57195181578406, + "learning_rate": 9.957444576459864e-06, + "loss": 1.2193, + "step": 2425 + }, + { + "epoch": 0.41, + "grad_norm": 26.68571032098248, + "learning_rate": 9.956798288608442e-06, + "loss": 1.2197, + "step": 2430 + }, + { + "epoch": 0.41, + "grad_norm": 23.360805598503173, + "learning_rate": 9.956147151387559e-06, + "loss": 1.233, + "step": 2435 + }, + { + "epoch": 0.41, + "grad_norm": 22.128646502976743, + "learning_rate": 9.955491165434236e-06, + "loss": 1.2141, + "step": 2440 + }, + { + "epoch": 0.42, + "grad_norm": 26.658087544079972, + "learning_rate": 9.954830331390245e-06, + "loss": 1.1987, + "step": 2445 + }, + { + "epoch": 0.42, + "grad_norm": 12.065272609028908, + "learning_rate": 9.954164649902097e-06, + "loss": 1.192, + "step": 2450 + }, + { + "epoch": 0.42, + "grad_norm": 17.279093623666416, + "learning_rate": 9.953494121621047e-06, + "loss": 1.2053, + "step": 2455 + }, + { + "epoch": 0.42, + "grad_norm": 24.282084124178198, + "learning_rate": 9.95281874720309e-06, + "loss": 1.1988, + "step": 2460 + }, + { + "epoch": 0.42, + "grad_norm": 44.50488285966649, + "learning_rate": 9.952138527308963e-06, + "loss": 1.208, + "step": 2465 + }, + { + "epoch": 0.42, + "grad_norm": 33.79862535916042, + "learning_rate": 9.951453462604143e-06, + "loss": 1.1916, + "step": 2470 + }, + { + "epoch": 0.42, + "grad_norm": 27.13073772350017, + "learning_rate": 9.950763553758848e-06, + "loss": 1.21, + "step": 2475 + }, + { + "epoch": 0.42, + "grad_norm": 49.80064601585301, + "learning_rate": 9.950068801448037e-06, + "loss": 1.2042, + "step": 2480 + }, + { + "epoch": 0.42, + "grad_norm": 66.21016098075907, + "learning_rate": 9.9493692063514e-06, + "loss": 1.2083, + "step": 2485 + }, + { + "epoch": 0.42, + "grad_norm": 26.587068635738756, + "learning_rate": 9.948664769153372e-06, + "loss": 1.2169, + "step": 2490 + }, + { + "epoch": 0.42, + "grad_norm": 10.940254770635514, + "learning_rate": 9.947955490543125e-06, + "loss": 1.197, + "step": 2495 + }, + { + "epoch": 0.43, + "grad_norm": 22.998827359689898, + "learning_rate": 9.94724137121456e-06, + "loss": 1.1862, + "step": 2500 + }, + { + "epoch": 0.43, + "grad_norm": 41.25760329284341, + "learning_rate": 9.946522411866325e-06, + "loss": 1.1847, + "step": 2505 + }, + { + "epoch": 0.43, + "grad_norm": 133.01481338781392, + "learning_rate": 9.945798613201794e-06, + "loss": 1.2163, + "step": 2510 + }, + { + "epoch": 0.43, + "grad_norm": 89.31490352272507, + "learning_rate": 9.94506997592908e-06, + "loss": 1.177, + "step": 2515 + }, + { + "epoch": 0.43, + "grad_norm": 30.656395841990992, + "learning_rate": 9.944336500761029e-06, + "loss": 1.1902, + "step": 2520 + }, + { + "epoch": 0.43, + "grad_norm": 40.71404633836761, + "learning_rate": 9.943598188415217e-06, + "loss": 1.1812, + "step": 2525 + }, + { + "epoch": 0.43, + "grad_norm": 44.483429975247155, + "learning_rate": 9.942855039613958e-06, + "loss": 1.2245, + "step": 2530 + }, + { + "epoch": 0.43, + "grad_norm": 59.00601624439954, + "learning_rate": 9.942107055084292e-06, + "loss": 1.1852, + "step": 2535 + }, + { + "epoch": 0.43, + "grad_norm": 48.899022645872606, + "learning_rate": 9.941354235557994e-06, + "loss": 1.1838, + "step": 2540 + }, + { + "epoch": 0.43, + "grad_norm": 17.207646488335833, + "learning_rate": 9.940596581771568e-06, + "loss": 1.2067, + "step": 2545 + }, + { + "epoch": 0.43, + "grad_norm": 13.145055187749728, + "learning_rate": 9.939834094466245e-06, + "loss": 1.2037, + "step": 2550 + }, + { + "epoch": 0.43, + "grad_norm": 20.3637903577066, + "learning_rate": 9.939066774387988e-06, + "loss": 1.2061, + "step": 2555 + }, + { + "epoch": 0.44, + "grad_norm": 23.27972184509164, + "learning_rate": 9.93829462228749e-06, + "loss": 1.1862, + "step": 2560 + }, + { + "epoch": 0.44, + "grad_norm": 15.504767354168685, + "learning_rate": 9.937517638920162e-06, + "loss": 1.1787, + "step": 2565 + }, + { + "epoch": 0.44, + "grad_norm": 18.882534891848458, + "learning_rate": 9.936735825046154e-06, + "loss": 1.1721, + "step": 2570 + }, + { + "epoch": 0.44, + "grad_norm": 47.85662609246532, + "learning_rate": 9.93594918143033e-06, + "loss": 1.1926, + "step": 2575 + }, + { + "epoch": 0.44, + "grad_norm": 80.08678346889124, + "learning_rate": 9.935157708842288e-06, + "loss": 1.1823, + "step": 2580 + }, + { + "epoch": 0.44, + "grad_norm": 28.614889713910266, + "learning_rate": 9.934361408056348e-06, + "loss": 1.1815, + "step": 2585 + }, + { + "epoch": 0.44, + "grad_norm": 53.20030002856704, + "learning_rate": 9.933560279851549e-06, + "loss": 1.2112, + "step": 2590 + }, + { + "epoch": 0.44, + "grad_norm": 56.59970823556055, + "learning_rate": 9.93275432501166e-06, + "loss": 1.1882, + "step": 2595 + }, + { + "epoch": 0.44, + "grad_norm": 43.98644204083481, + "learning_rate": 9.931943544325166e-06, + "loss": 1.16, + "step": 2600 + }, + { + "epoch": 0.44, + "grad_norm": 15.92441460995244, + "learning_rate": 9.931127938585275e-06, + "loss": 1.2017, + "step": 2605 + }, + { + "epoch": 0.44, + "grad_norm": 73.0494618190435, + "learning_rate": 9.930307508589918e-06, + "loss": 1.1923, + "step": 2610 + }, + { + "epoch": 0.44, + "grad_norm": 66.41961786413435, + "learning_rate": 9.929482255141744e-06, + "loss": 1.1775, + "step": 2615 + }, + { + "epoch": 0.45, + "grad_norm": 17.458880822215225, + "learning_rate": 9.928652179048119e-06, + "loss": 1.1606, + "step": 2620 + }, + { + "epoch": 0.45, + "grad_norm": 21.88590996455315, + "learning_rate": 9.927817281121131e-06, + "loss": 1.1666, + "step": 2625 + }, + { + "epoch": 0.45, + "grad_norm": 29.09234292628808, + "learning_rate": 9.926977562177582e-06, + "loss": 1.173, + "step": 2630 + }, + { + "epoch": 0.45, + "grad_norm": 34.55403219807346, + "learning_rate": 9.926133023038988e-06, + "loss": 1.184, + "step": 2635 + }, + { + "epoch": 0.45, + "grad_norm": 10.720705771918542, + "learning_rate": 9.92528366453159e-06, + "loss": 1.1818, + "step": 2640 + }, + { + "epoch": 0.45, + "grad_norm": 15.388664176736826, + "learning_rate": 9.924429487486339e-06, + "loss": 1.1724, + "step": 2645 + }, + { + "epoch": 0.45, + "grad_norm": 18.130272229112684, + "learning_rate": 9.923570492738895e-06, + "loss": 1.1772, + "step": 2650 + }, + { + "epoch": 0.45, + "grad_norm": 9.637369376811128, + "learning_rate": 9.922706681129634e-06, + "loss": 1.1873, + "step": 2655 + }, + { + "epoch": 0.45, + "grad_norm": 22.481508332457953, + "learning_rate": 9.921838053503654e-06, + "loss": 1.1841, + "step": 2660 + }, + { + "epoch": 0.45, + "grad_norm": 36.96789134182856, + "learning_rate": 9.92096461071075e-06, + "loss": 1.1646, + "step": 2665 + }, + { + "epoch": 0.45, + "grad_norm": 11.20424827349328, + "learning_rate": 9.920086353605439e-06, + "loss": 1.1683, + "step": 2670 + }, + { + "epoch": 0.45, + "grad_norm": 22.984932609600577, + "learning_rate": 9.919203283046942e-06, + "loss": 1.1832, + "step": 2675 + }, + { + "epoch": 0.46, + "grad_norm": 17.123529916442433, + "learning_rate": 9.918315399899188e-06, + "loss": 1.1692, + "step": 2680 + }, + { + "epoch": 0.46, + "grad_norm": 11.409240507891948, + "learning_rate": 9.91742270503082e-06, + "loss": 1.1767, + "step": 2685 + }, + { + "epoch": 0.46, + "grad_norm": 12.592745764863913, + "learning_rate": 9.916525199315186e-06, + "loss": 1.1824, + "step": 2690 + }, + { + "epoch": 0.46, + "grad_norm": 35.317900020424446, + "learning_rate": 9.91562288363034e-06, + "loss": 1.1766, + "step": 2695 + }, + { + "epoch": 0.46, + "grad_norm": 11.418561626974661, + "learning_rate": 9.91471575885904e-06, + "loss": 1.1722, + "step": 2700 + }, + { + "epoch": 0.46, + "grad_norm": 16.186458826087673, + "learning_rate": 9.91380382588875e-06, + "loss": 1.1511, + "step": 2705 + }, + { + "epoch": 0.46, + "grad_norm": 13.392876468515981, + "learning_rate": 9.912887085611642e-06, + "loss": 1.1582, + "step": 2710 + }, + { + "epoch": 0.46, + "grad_norm": 16.027146893897786, + "learning_rate": 9.911965538924584e-06, + "loss": 1.2027, + "step": 2715 + }, + { + "epoch": 0.46, + "grad_norm": 23.74651860266453, + "learning_rate": 9.911039186729154e-06, + "loss": 1.1886, + "step": 2720 + }, + { + "epoch": 0.46, + "grad_norm": 15.629032974183286, + "learning_rate": 9.910108029931622e-06, + "loss": 1.1849, + "step": 2725 + }, + { + "epoch": 0.46, + "grad_norm": 12.341384937928606, + "learning_rate": 9.90917206944297e-06, + "loss": 1.1755, + "step": 2730 + }, + { + "epoch": 0.46, + "grad_norm": 35.88736400267985, + "learning_rate": 9.908231306178869e-06, + "loss": 1.1932, + "step": 2735 + }, + { + "epoch": 0.47, + "grad_norm": 13.536749553802284, + "learning_rate": 9.907285741059694e-06, + "loss": 1.1806, + "step": 2740 + }, + { + "epoch": 0.47, + "grad_norm": 20.12273250818116, + "learning_rate": 9.906335375010518e-06, + "loss": 1.178, + "step": 2745 + }, + { + "epoch": 0.47, + "grad_norm": 28.476959265452525, + "learning_rate": 9.905380208961109e-06, + "loss": 1.1809, + "step": 2750 + }, + { + "epoch": 0.47, + "grad_norm": 22.670220119269143, + "learning_rate": 9.904420243845931e-06, + "loss": 1.1638, + "step": 2755 + }, + { + "epoch": 0.47, + "grad_norm": 66.52647296720023, + "learning_rate": 9.903455480604144e-06, + "loss": 1.1785, + "step": 2760 + }, + { + "epoch": 0.47, + "grad_norm": 65.14747317513056, + "learning_rate": 9.902485920179605e-06, + "loss": 1.1931, + "step": 2765 + }, + { + "epoch": 0.47, + "grad_norm": 35.62003805602506, + "learning_rate": 9.901511563520855e-06, + "loss": 1.1614, + "step": 2770 + }, + { + "epoch": 0.47, + "grad_norm": 39.14685246448072, + "learning_rate": 9.900532411581141e-06, + "loss": 1.1755, + "step": 2775 + }, + { + "epoch": 0.47, + "grad_norm": 89.34481105669401, + "learning_rate": 9.899548465318387e-06, + "loss": 1.1767, + "step": 2780 + }, + { + "epoch": 0.47, + "grad_norm": 52.59479557413615, + "learning_rate": 9.898559725695217e-06, + "loss": 1.1697, + "step": 2785 + }, + { + "epoch": 0.47, + "grad_norm": 35.276525624359245, + "learning_rate": 9.89756619367894e-06, + "loss": 1.1632, + "step": 2790 + }, + { + "epoch": 0.48, + "grad_norm": 11.966535745499472, + "learning_rate": 9.89656787024156e-06, + "loss": 1.16, + "step": 2795 + }, + { + "epoch": 0.48, + "grad_norm": 13.569210509013814, + "learning_rate": 9.895564756359758e-06, + "loss": 1.1749, + "step": 2800 + }, + { + "epoch": 0.48, + "grad_norm": 19.739276355409753, + "learning_rate": 9.89455685301491e-06, + "loss": 1.1714, + "step": 2805 + }, + { + "epoch": 0.48, + "grad_norm": 59.13841059108984, + "learning_rate": 9.893544161193077e-06, + "loss": 1.1816, + "step": 2810 + }, + { + "epoch": 0.48, + "grad_norm": 24.5159909442423, + "learning_rate": 9.892526681884997e-06, + "loss": 1.1617, + "step": 2815 + }, + { + "epoch": 0.48, + "grad_norm": 53.43412207505383, + "learning_rate": 9.891504416086102e-06, + "loss": 1.1698, + "step": 2820 + }, + { + "epoch": 0.48, + "grad_norm": 12.028092300763062, + "learning_rate": 9.890477364796502e-06, + "loss": 1.1462, + "step": 2825 + }, + { + "epoch": 0.48, + "grad_norm": 10.266849451796633, + "learning_rate": 9.889445529020989e-06, + "loss": 1.1739, + "step": 2830 + }, + { + "epoch": 0.48, + "grad_norm": 33.256972593357, + "learning_rate": 9.888408909769032e-06, + "loss": 1.1638, + "step": 2835 + }, + { + "epoch": 0.48, + "grad_norm": 13.240389968680208, + "learning_rate": 9.887367508054788e-06, + "loss": 1.1539, + "step": 2840 + }, + { + "epoch": 0.48, + "grad_norm": 38.32149327567404, + "learning_rate": 9.886321324897088e-06, + "loss": 1.1453, + "step": 2845 + }, + { + "epoch": 0.48, + "grad_norm": 17.92159888176316, + "learning_rate": 9.885270361319439e-06, + "loss": 1.129, + "step": 2850 + }, + { + "epoch": 0.49, + "grad_norm": 51.068604121950585, + "learning_rate": 9.884214618350028e-06, + "loss": 1.1633, + "step": 2855 + }, + { + "epoch": 0.49, + "grad_norm": 74.99666351301994, + "learning_rate": 9.883154097021717e-06, + "loss": 1.158, + "step": 2860 + }, + { + "epoch": 0.49, + "grad_norm": 30.31376407102048, + "learning_rate": 9.882088798372043e-06, + "loss": 1.1438, + "step": 2865 + }, + { + "epoch": 0.49, + "grad_norm": 27.670642676177405, + "learning_rate": 9.881018723443214e-06, + "loss": 1.1429, + "step": 2870 + }, + { + "epoch": 0.49, + "grad_norm": 23.64857996752343, + "learning_rate": 9.879943873282116e-06, + "loss": 1.1514, + "step": 2875 + }, + { + "epoch": 0.49, + "grad_norm": 13.005128955967104, + "learning_rate": 9.878864248940304e-06, + "loss": 1.1595, + "step": 2880 + }, + { + "epoch": 0.49, + "grad_norm": 8.966016821913678, + "learning_rate": 9.877779851474003e-06, + "loss": 1.1417, + "step": 2885 + }, + { + "epoch": 0.49, + "grad_norm": 12.446956221399299, + "learning_rate": 9.876690681944107e-06, + "loss": 1.1525, + "step": 2890 + }, + { + "epoch": 0.49, + "grad_norm": 9.693284053412812, + "learning_rate": 9.87559674141618e-06, + "loss": 1.1077, + "step": 2895 + }, + { + "epoch": 0.49, + "grad_norm": 17.179007907357477, + "learning_rate": 9.874498030960455e-06, + "loss": 1.1408, + "step": 2900 + }, + { + "epoch": 0.49, + "grad_norm": 33.083858737896215, + "learning_rate": 9.87339455165183e-06, + "loss": 1.1454, + "step": 2905 + }, + { + "epoch": 0.49, + "grad_norm": 79.29658401277891, + "learning_rate": 9.872286304569867e-06, + "loss": 1.1322, + "step": 2910 + }, + { + "epoch": 0.5, + "grad_norm": 20.385160626134113, + "learning_rate": 9.871173290798795e-06, + "loss": 1.1611, + "step": 2915 + }, + { + "epoch": 0.5, + "grad_norm": 10.773144522385966, + "learning_rate": 9.870055511427507e-06, + "loss": 1.1643, + "step": 2920 + }, + { + "epoch": 0.5, + "grad_norm": 16.312322239246836, + "learning_rate": 9.868932967549554e-06, + "loss": 1.1377, + "step": 2925 + }, + { + "epoch": 0.5, + "grad_norm": 23.14049593920567, + "learning_rate": 9.867805660263152e-06, + "loss": 1.1327, + "step": 2930 + }, + { + "epoch": 0.5, + "grad_norm": 19.77386440226736, + "learning_rate": 9.866673590671176e-06, + "loss": 1.1358, + "step": 2935 + }, + { + "epoch": 0.5, + "grad_norm": 30.710807933209413, + "learning_rate": 9.86553675988116e-06, + "loss": 1.1293, + "step": 2940 + }, + { + "epoch": 0.5, + "grad_norm": 9.894878262098294, + "learning_rate": 9.864395169005297e-06, + "loss": 1.1428, + "step": 2945 + }, + { + "epoch": 0.5, + "grad_norm": 60.79458334408716, + "learning_rate": 9.863248819160436e-06, + "loss": 1.1302, + "step": 2950 + }, + { + "epoch": 0.5, + "grad_norm": 123.22513760731647, + "learning_rate": 9.862097711468082e-06, + "loss": 1.1408, + "step": 2955 + }, + { + "epoch": 0.5, + "grad_norm": 12.980769182017495, + "learning_rate": 9.860941847054394e-06, + "loss": 1.176, + "step": 2960 + }, + { + "epoch": 0.5, + "grad_norm": 77.79656950182576, + "learning_rate": 9.859781227050186e-06, + "loss": 1.1614, + "step": 2965 + }, + { + "epoch": 0.5, + "grad_norm": 45.821558359257104, + "learning_rate": 9.858615852590923e-06, + "loss": 1.1402, + "step": 2970 + }, + { + "epoch": 0.51, + "grad_norm": 15.431782798306614, + "learning_rate": 9.857445724816723e-06, + "loss": 1.1479, + "step": 2975 + }, + { + "epoch": 0.51, + "grad_norm": 18.816255936865193, + "learning_rate": 9.856270844872355e-06, + "loss": 1.1414, + "step": 2980 + }, + { + "epoch": 0.51, + "grad_norm": 10.78799968669382, + "learning_rate": 9.855091213907233e-06, + "loss": 1.169, + "step": 2985 + }, + { + "epoch": 0.51, + "grad_norm": 20.618619597391554, + "learning_rate": 9.853906833075424e-06, + "loss": 1.1364, + "step": 2990 + }, + { + "epoch": 0.51, + "grad_norm": 29.602152847998962, + "learning_rate": 9.852717703535637e-06, + "loss": 1.1313, + "step": 2995 + }, + { + "epoch": 0.51, + "grad_norm": 20.41523192024887, + "learning_rate": 9.85152382645123e-06, + "loss": 1.13, + "step": 3000 + }, + { + "epoch": 0.51, + "grad_norm": 37.823239974778716, + "learning_rate": 9.85032520299021e-06, + "loss": 1.1508, + "step": 3005 + }, + { + "epoch": 0.51, + "grad_norm": 43.2473952972184, + "learning_rate": 9.849121834325216e-06, + "loss": 1.1316, + "step": 3010 + }, + { + "epoch": 0.51, + "grad_norm": 52.74014485034811, + "learning_rate": 9.847913721633541e-06, + "loss": 1.127, + "step": 3015 + }, + { + "epoch": 0.51, + "grad_norm": 11.225887741481749, + "learning_rate": 9.846700866097111e-06, + "loss": 1.1283, + "step": 3020 + }, + { + "epoch": 0.51, + "grad_norm": 22.830528962299184, + "learning_rate": 9.845483268902497e-06, + "loss": 1.1332, + "step": 3025 + }, + { + "epoch": 0.52, + "grad_norm": 9.962513124404307, + "learning_rate": 9.844260931240908e-06, + "loss": 1.1192, + "step": 3030 + }, + { + "epoch": 0.52, + "grad_norm": 10.075114486387971, + "learning_rate": 9.843033854308189e-06, + "loss": 1.1234, + "step": 3035 + }, + { + "epoch": 0.52, + "grad_norm": 43.92726259157927, + "learning_rate": 9.841802039304819e-06, + "loss": 1.1325, + "step": 3040 + }, + { + "epoch": 0.52, + "grad_norm": 10.602847875408965, + "learning_rate": 9.84056548743592e-06, + "loss": 1.0945, + "step": 3045 + }, + { + "epoch": 0.52, + "grad_norm": 72.84853493289299, + "learning_rate": 9.839324199911244e-06, + "loss": 1.1146, + "step": 3050 + }, + { + "epoch": 0.52, + "grad_norm": 64.59634241144452, + "learning_rate": 9.838078177945174e-06, + "loss": 1.1265, + "step": 3055 + }, + { + "epoch": 0.52, + "grad_norm": 13.606427097030371, + "learning_rate": 9.83682742275673e-06, + "loss": 1.1458, + "step": 3060 + }, + { + "epoch": 0.52, + "grad_norm": 22.74960141201855, + "learning_rate": 9.835571935569554e-06, + "loss": 1.1276, + "step": 3065 + }, + { + "epoch": 0.52, + "grad_norm": 14.872212344467762, + "learning_rate": 9.834311717611929e-06, + "loss": 1.1275, + "step": 3070 + }, + { + "epoch": 0.52, + "grad_norm": 26.848450408931033, + "learning_rate": 9.833046770116758e-06, + "loss": 1.1178, + "step": 3075 + }, + { + "epoch": 0.52, + "grad_norm": 39.22359304839263, + "learning_rate": 9.831777094321572e-06, + "loss": 1.1238, + "step": 3080 + }, + { + "epoch": 0.52, + "grad_norm": 38.21441517690827, + "learning_rate": 9.83050269146853e-06, + "loss": 1.13, + "step": 3085 + }, + { + "epoch": 0.53, + "grad_norm": 71.39821885265845, + "learning_rate": 9.829223562804415e-06, + "loss": 1.1276, + "step": 3090 + }, + { + "epoch": 0.53, + "grad_norm": 69.47267188756749, + "learning_rate": 9.827939709580631e-06, + "loss": 1.1404, + "step": 3095 + }, + { + "epoch": 0.53, + "grad_norm": 27.2120383573781, + "learning_rate": 9.82665113305321e-06, + "loss": 1.1328, + "step": 3100 + }, + { + "epoch": 0.53, + "grad_norm": 17.3549164827145, + "learning_rate": 9.8253578344828e-06, + "loss": 1.0994, + "step": 3105 + }, + { + "epoch": 0.53, + "grad_norm": 26.975780544067167, + "learning_rate": 9.824059815134668e-06, + "loss": 1.114, + "step": 3110 + }, + { + "epoch": 0.53, + "grad_norm": 10.89034671963636, + "learning_rate": 9.822757076278701e-06, + "loss": 1.127, + "step": 3115 + }, + { + "epoch": 0.53, + "grad_norm": 18.784217590750867, + "learning_rate": 9.821449619189407e-06, + "loss": 1.1127, + "step": 3120 + }, + { + "epoch": 0.53, + "grad_norm": 16.672619503726025, + "learning_rate": 9.820137445145903e-06, + "loss": 1.1257, + "step": 3125 + }, + { + "epoch": 0.53, + "grad_norm": 33.283960901757666, + "learning_rate": 9.818820555431928e-06, + "loss": 1.1119, + "step": 3130 + }, + { + "epoch": 0.53, + "grad_norm": 20.205326596044547, + "learning_rate": 9.817498951335827e-06, + "loss": 1.1172, + "step": 3135 + }, + { + "epoch": 0.53, + "grad_norm": 38.647807030435, + "learning_rate": 9.81617263415056e-06, + "loss": 1.1332, + "step": 3140 + }, + { + "epoch": 0.53, + "grad_norm": 14.620577977257334, + "learning_rate": 9.814841605173704e-06, + "loss": 1.0901, + "step": 3145 + }, + { + "epoch": 0.54, + "grad_norm": 30.05892374289853, + "learning_rate": 9.813505865707437e-06, + "loss": 1.1164, + "step": 3150 + }, + { + "epoch": 0.54, + "grad_norm": 22.96909488857937, + "learning_rate": 9.81216541705855e-06, + "loss": 1.1051, + "step": 3155 + }, + { + "epoch": 0.54, + "grad_norm": 18.269660698493023, + "learning_rate": 9.810820260538441e-06, + "loss": 1.1375, + "step": 3160 + }, + { + "epoch": 0.54, + "grad_norm": 13.772284348648533, + "learning_rate": 9.809470397463112e-06, + "loss": 1.116, + "step": 3165 + }, + { + "epoch": 0.54, + "grad_norm": 10.855531811848072, + "learning_rate": 9.808115829153169e-06, + "loss": 1.1404, + "step": 3170 + }, + { + "epoch": 0.54, + "grad_norm": 15.71560575354875, + "learning_rate": 9.806756556933823e-06, + "loss": 1.1401, + "step": 3175 + }, + { + "epoch": 0.54, + "grad_norm": 10.457646473635371, + "learning_rate": 9.805392582134888e-06, + "loss": 1.0976, + "step": 3180 + }, + { + "epoch": 0.54, + "grad_norm": 12.021716252996788, + "learning_rate": 9.804023906090779e-06, + "loss": 1.1171, + "step": 3185 + }, + { + "epoch": 0.54, + "grad_norm": 36.17543609821767, + "learning_rate": 9.802650530140503e-06, + "loss": 1.1359, + "step": 3190 + }, + { + "epoch": 0.54, + "grad_norm": 14.174434300819575, + "learning_rate": 9.801272455627678e-06, + "loss": 1.1354, + "step": 3195 + }, + { + "epoch": 0.54, + "grad_norm": 17.968977941832232, + "learning_rate": 9.799889683900506e-06, + "loss": 1.082, + "step": 3200 + }, + { + "epoch": 0.54, + "grad_norm": 12.179685514337823, + "learning_rate": 9.798502216311791e-06, + "loss": 1.1101, + "step": 3205 + }, + { + "epoch": 0.55, + "grad_norm": 28.261361854490232, + "learning_rate": 9.797110054218932e-06, + "loss": 1.1267, + "step": 3210 + }, + { + "epoch": 0.55, + "grad_norm": 24.91609567279778, + "learning_rate": 9.795713198983918e-06, + "loss": 1.1207, + "step": 3215 + }, + { + "epoch": 0.55, + "grad_norm": 19.450449096827676, + "learning_rate": 9.794311651973329e-06, + "loss": 1.1189, + "step": 3220 + }, + { + "epoch": 0.55, + "grad_norm": 34.838540645917654, + "learning_rate": 9.792905414558336e-06, + "loss": 1.1238, + "step": 3225 + }, + { + "epoch": 0.55, + "grad_norm": 49.79019197252163, + "learning_rate": 9.7914944881147e-06, + "loss": 1.1031, + "step": 3230 + }, + { + "epoch": 0.55, + "grad_norm": 22.569223081112888, + "learning_rate": 9.790078874022767e-06, + "loss": 1.103, + "step": 3235 + }, + { + "epoch": 0.55, + "grad_norm": 34.088080046583904, + "learning_rate": 9.788658573667475e-06, + "loss": 1.1132, + "step": 3240 + }, + { + "epoch": 0.55, + "grad_norm": 110.16064646913127, + "learning_rate": 9.787233588438336e-06, + "loss": 1.1, + "step": 3245 + }, + { + "epoch": 0.55, + "grad_norm": 40.23449989378385, + "learning_rate": 9.785803919729455e-06, + "loss": 1.1135, + "step": 3250 + }, + { + "epoch": 0.55, + "grad_norm": 44.46605455243022, + "learning_rate": 9.784369568939516e-06, + "loss": 1.1247, + "step": 3255 + }, + { + "epoch": 0.55, + "grad_norm": 21.131004713406863, + "learning_rate": 9.78293053747178e-06, + "loss": 1.1123, + "step": 3260 + }, + { + "epoch": 0.56, + "grad_norm": 21.71929947964882, + "learning_rate": 9.781486826734092e-06, + "loss": 1.0865, + "step": 3265 + }, + { + "epoch": 0.56, + "grad_norm": 42.99587715186835, + "learning_rate": 9.780038438138875e-06, + "loss": 1.1011, + "step": 3270 + }, + { + "epoch": 0.56, + "grad_norm": 39.95133149205219, + "learning_rate": 9.778585373103123e-06, + "loss": 1.0921, + "step": 3275 + }, + { + "epoch": 0.56, + "grad_norm": 26.10809015145351, + "learning_rate": 9.777127633048412e-06, + "loss": 1.1019, + "step": 3280 + }, + { + "epoch": 0.56, + "grad_norm": 34.02220191762274, + "learning_rate": 9.775665219400884e-06, + "loss": 1.1112, + "step": 3285 + }, + { + "epoch": 0.56, + "grad_norm": 12.365701088432422, + "learning_rate": 9.774198133591263e-06, + "loss": 1.0958, + "step": 3290 + }, + { + "epoch": 0.56, + "grad_norm": 10.289864979863733, + "learning_rate": 9.772726377054838e-06, + "loss": 1.0848, + "step": 3295 + }, + { + "epoch": 0.56, + "grad_norm": 14.09533795089284, + "learning_rate": 9.771249951231465e-06, + "loss": 1.1023, + "step": 3300 + }, + { + "epoch": 0.56, + "grad_norm": 33.23702455667827, + "learning_rate": 9.769768857565573e-06, + "loss": 1.1026, + "step": 3305 + }, + { + "epoch": 0.56, + "grad_norm": 10.308081889604273, + "learning_rate": 9.768283097506155e-06, + "loss": 1.079, + "step": 3310 + }, + { + "epoch": 0.56, + "grad_norm": 24.694123563015506, + "learning_rate": 9.766792672506771e-06, + "loss": 1.0742, + "step": 3315 + }, + { + "epoch": 0.56, + "grad_norm": 20.34539350546079, + "learning_rate": 9.765297584025542e-06, + "loss": 1.0914, + "step": 3320 + }, + { + "epoch": 0.57, + "grad_norm": 46.916385220387916, + "learning_rate": 9.763797833525157e-06, + "loss": 1.097, + "step": 3325 + }, + { + "epoch": 0.57, + "grad_norm": 10.347280588345283, + "learning_rate": 9.76229342247286e-06, + "loss": 1.0702, + "step": 3330 + }, + { + "epoch": 0.57, + "grad_norm": 8.490168781307537, + "learning_rate": 9.760784352340457e-06, + "loss": 1.1083, + "step": 3335 + }, + { + "epoch": 0.57, + "grad_norm": 35.84052955467092, + "learning_rate": 9.759270624604314e-06, + "loss": 1.0903, + "step": 3340 + }, + { + "epoch": 0.57, + "grad_norm": 30.14735937800149, + "learning_rate": 9.757752240745348e-06, + "loss": 1.0864, + "step": 3345 + }, + { + "epoch": 0.57, + "grad_norm": 14.482082933392752, + "learning_rate": 9.756229202249038e-06, + "loss": 1.1044, + "step": 3350 + }, + { + "epoch": 0.57, + "grad_norm": 24.69217569957398, + "learning_rate": 9.75470151060541e-06, + "loss": 1.0724, + "step": 3355 + }, + { + "epoch": 0.57, + "grad_norm": 11.182521065589473, + "learning_rate": 9.753169167309052e-06, + "loss": 1.1146, + "step": 3360 + }, + { + "epoch": 0.57, + "grad_norm": 11.069497891124406, + "learning_rate": 9.751632173859092e-06, + "loss": 1.0826, + "step": 3365 + }, + { + "epoch": 0.57, + "grad_norm": 10.510795081029487, + "learning_rate": 9.750090531759211e-06, + "loss": 1.0923, + "step": 3370 + }, + { + "epoch": 0.57, + "grad_norm": 25.081856809348373, + "learning_rate": 9.748544242517644e-06, + "loss": 1.0852, + "step": 3375 + }, + { + "epoch": 0.57, + "grad_norm": 16.06757949104813, + "learning_rate": 9.746993307647165e-06, + "loss": 1.1093, + "step": 3380 + }, + { + "epoch": 0.58, + "grad_norm": 35.25108551159975, + "learning_rate": 9.745437728665095e-06, + "loss": 1.0882, + "step": 3385 + }, + { + "epoch": 0.58, + "grad_norm": 10.884674188000995, + "learning_rate": 9.743877507093298e-06, + "loss": 1.0845, + "step": 3390 + }, + { + "epoch": 0.58, + "grad_norm": 10.939698638231848, + "learning_rate": 9.742312644458183e-06, + "loss": 1.1061, + "step": 3395 + }, + { + "epoch": 0.58, + "grad_norm": 19.83963911448851, + "learning_rate": 9.740743142290696e-06, + "loss": 1.0948, + "step": 3400 + }, + { + "epoch": 0.58, + "grad_norm": 16.542057789428704, + "learning_rate": 9.739169002126326e-06, + "loss": 1.0681, + "step": 3405 + }, + { + "epoch": 0.58, + "grad_norm": 12.85859530160301, + "learning_rate": 9.737590225505091e-06, + "loss": 1.0714, + "step": 3410 + }, + { + "epoch": 0.58, + "grad_norm": 10.529021625117974, + "learning_rate": 9.736006813971557e-06, + "loss": 1.083, + "step": 3415 + }, + { + "epoch": 0.58, + "grad_norm": 32.642327759447724, + "learning_rate": 9.734418769074813e-06, + "loss": 1.0949, + "step": 3420 + }, + { + "epoch": 0.58, + "grad_norm": 18.722861850410514, + "learning_rate": 9.732826092368491e-06, + "loss": 1.0958, + "step": 3425 + }, + { + "epoch": 0.58, + "grad_norm": 39.00659387481143, + "learning_rate": 9.731228785410746e-06, + "loss": 1.0861, + "step": 3430 + }, + { + "epoch": 0.58, + "grad_norm": 10.711037906612924, + "learning_rate": 9.729626849764266e-06, + "loss": 1.0772, + "step": 3435 + }, + { + "epoch": 0.58, + "grad_norm": 9.42121870550603, + "learning_rate": 9.728020286996273e-06, + "loss": 1.0815, + "step": 3440 + }, + { + "epoch": 0.59, + "grad_norm": 30.639900438665272, + "learning_rate": 9.726409098678505e-06, + "loss": 1.0801, + "step": 3445 + }, + { + "epoch": 0.59, + "grad_norm": 11.123471421452953, + "learning_rate": 9.724793286387233e-06, + "loss": 1.0938, + "step": 3450 + }, + { + "epoch": 0.59, + "grad_norm": 19.93670960977343, + "learning_rate": 9.72317285170325e-06, + "loss": 1.0521, + "step": 3455 + }, + { + "epoch": 0.59, + "grad_norm": 28.837291173282402, + "learning_rate": 9.721547796211872e-06, + "loss": 1.0937, + "step": 3460 + }, + { + "epoch": 0.59, + "grad_norm": 99.22043991359959, + "learning_rate": 9.719918121502933e-06, + "loss": 1.1099, + "step": 3465 + }, + { + "epoch": 0.59, + "grad_norm": 118.91259663380502, + "learning_rate": 9.718283829170788e-06, + "loss": 1.0882, + "step": 3470 + }, + { + "epoch": 0.59, + "grad_norm": 94.83197424147019, + "learning_rate": 9.71664492081431e-06, + "loss": 1.1115, + "step": 3475 + }, + { + "epoch": 0.59, + "grad_norm": 23.176334095455275, + "learning_rate": 9.715001398036884e-06, + "loss": 1.1077, + "step": 3480 + }, + { + "epoch": 0.59, + "grad_norm": 23.25526333684265, + "learning_rate": 9.713353262446419e-06, + "loss": 1.0704, + "step": 3485 + }, + { + "epoch": 0.59, + "grad_norm": 62.50190406613821, + "learning_rate": 9.711700515655327e-06, + "loss": 1.0833, + "step": 3490 + }, + { + "epoch": 0.59, + "grad_norm": 46.93771092847943, + "learning_rate": 9.710043159280532e-06, + "loss": 1.0825, + "step": 3495 + }, + { + "epoch": 0.6, + "grad_norm": 29.238170589766323, + "learning_rate": 9.708381194943476e-06, + "loss": 1.0879, + "step": 3500 + }, + { + "epoch": 0.6, + "grad_norm": 13.685350295706728, + "learning_rate": 9.706714624270097e-06, + "loss": 1.0783, + "step": 3505 + }, + { + "epoch": 0.6, + "grad_norm": 13.303197061444635, + "learning_rate": 9.705043448890852e-06, + "loss": 1.0803, + "step": 3510 + }, + { + "epoch": 0.6, + "grad_norm": 23.053480521822973, + "learning_rate": 9.703367670440695e-06, + "loss": 1.0666, + "step": 3515 + }, + { + "epoch": 0.6, + "grad_norm": 16.52258620177513, + "learning_rate": 9.701687290559084e-06, + "loss": 1.082, + "step": 3520 + }, + { + "epoch": 0.6, + "grad_norm": 22.692161284567145, + "learning_rate": 9.700002310889981e-06, + "loss": 1.0668, + "step": 3525 + }, + { + "epoch": 0.6, + "grad_norm": 20.73627310180072, + "learning_rate": 9.698312733081847e-06, + "loss": 1.0806, + "step": 3530 + }, + { + "epoch": 0.6, + "grad_norm": 31.383021093811415, + "learning_rate": 9.696618558787643e-06, + "loss": 1.0799, + "step": 3535 + }, + { + "epoch": 0.6, + "grad_norm": 35.7883766937957, + "learning_rate": 9.69491978966482e-06, + "loss": 1.0579, + "step": 3540 + }, + { + "epoch": 0.6, + "grad_norm": 22.439749721861627, + "learning_rate": 9.693216427375338e-06, + "loss": 1.055, + "step": 3545 + }, + { + "epoch": 0.6, + "grad_norm": 10.528624040661086, + "learning_rate": 9.691508473585635e-06, + "loss": 1.0866, + "step": 3550 + }, + { + "epoch": 0.6, + "grad_norm": 14.191808567072622, + "learning_rate": 9.689795929966653e-06, + "loss": 1.0625, + "step": 3555 + }, + { + "epoch": 0.61, + "grad_norm": 49.35630024440896, + "learning_rate": 9.688078798193816e-06, + "loss": 1.0858, + "step": 3560 + }, + { + "epoch": 0.61, + "grad_norm": 141.8741033629112, + "learning_rate": 9.686357079947042e-06, + "loss": 1.0819, + "step": 3565 + }, + { + "epoch": 0.61, + "grad_norm": 101.01456442603427, + "learning_rate": 9.684630776910734e-06, + "loss": 1.0799, + "step": 3570 + }, + { + "epoch": 0.61, + "grad_norm": 48.44038091165258, + "learning_rate": 9.682899890773782e-06, + "loss": 1.0784, + "step": 3575 + }, + { + "epoch": 0.61, + "grad_norm": 39.09727281088586, + "learning_rate": 9.68116442322956e-06, + "loss": 1.094, + "step": 3580 + }, + { + "epoch": 0.61, + "grad_norm": 58.65427281326046, + "learning_rate": 9.679424375975916e-06, + "loss": 1.0738, + "step": 3585 + }, + { + "epoch": 0.61, + "grad_norm": 14.717354874808702, + "learning_rate": 9.677679750715194e-06, + "loss": 1.0706, + "step": 3590 + }, + { + "epoch": 0.61, + "grad_norm": 12.32608732711157, + "learning_rate": 9.675930549154201e-06, + "loss": 1.0567, + "step": 3595 + }, + { + "epoch": 0.61, + "grad_norm": 20.979587808667443, + "learning_rate": 9.674176773004232e-06, + "loss": 1.0738, + "step": 3600 + }, + { + "epoch": 0.61, + "grad_norm": 19.984425443283612, + "learning_rate": 9.672418423981051e-06, + "loss": 1.0816, + "step": 3605 + }, + { + "epoch": 0.61, + "grad_norm": 15.322899089745414, + "learning_rate": 9.6706555038049e-06, + "loss": 1.0641, + "step": 3610 + }, + { + "epoch": 0.61, + "grad_norm": 14.27669372683442, + "learning_rate": 9.66888801420049e-06, + "loss": 1.0529, + "step": 3615 + }, + { + "epoch": 0.62, + "grad_norm": 26.73787927731739, + "learning_rate": 9.667115956897007e-06, + "loss": 1.0565, + "step": 3620 + }, + { + "epoch": 0.62, + "grad_norm": 14.332940164863567, + "learning_rate": 9.6653393336281e-06, + "loss": 1.0608, + "step": 3625 + }, + { + "epoch": 0.62, + "grad_norm": 17.957432148595675, + "learning_rate": 9.663558146131886e-06, + "loss": 1.062, + "step": 3630 + }, + { + "epoch": 0.62, + "grad_norm": 17.524600244030466, + "learning_rate": 9.66177239615095e-06, + "loss": 1.0416, + "step": 3635 + }, + { + "epoch": 0.62, + "grad_norm": 33.43797866232677, + "learning_rate": 9.65998208543234e-06, + "loss": 1.0665, + "step": 3640 + }, + { + "epoch": 0.62, + "grad_norm": 28.187859604767837, + "learning_rate": 9.658187215727567e-06, + "loss": 1.0512, + "step": 3645 + }, + { + "epoch": 0.62, + "grad_norm": 43.17834437252287, + "learning_rate": 9.656387788792594e-06, + "loss": 1.0667, + "step": 3650 + }, + { + "epoch": 0.62, + "grad_norm": 16.833570997925662, + "learning_rate": 9.654583806387855e-06, + "loss": 1.0474, + "step": 3655 + }, + { + "epoch": 0.62, + "grad_norm": 19.4936094387291, + "learning_rate": 9.652775270278236e-06, + "loss": 1.0629, + "step": 3660 + }, + { + "epoch": 0.62, + "grad_norm": 12.406721518276477, + "learning_rate": 9.65096218223307e-06, + "loss": 1.0415, + "step": 3665 + }, + { + "epoch": 0.62, + "grad_norm": 14.348020212680531, + "learning_rate": 9.649144544026154e-06, + "loss": 1.0609, + "step": 3670 + }, + { + "epoch": 0.62, + "grad_norm": 25.662165096305973, + "learning_rate": 9.647322357435734e-06, + "loss": 1.0547, + "step": 3675 + }, + { + "epoch": 0.63, + "grad_norm": 62.94870309796483, + "learning_rate": 9.6454956242445e-06, + "loss": 1.0514, + "step": 3680 + }, + { + "epoch": 0.63, + "grad_norm": 33.18978079391046, + "learning_rate": 9.643664346239598e-06, + "loss": 1.0716, + "step": 3685 + }, + { + "epoch": 0.63, + "grad_norm": 34.21728007762453, + "learning_rate": 9.641828525212616e-06, + "loss": 1.0459, + "step": 3690 + }, + { + "epoch": 0.63, + "grad_norm": 19.398489033262475, + "learning_rate": 9.639988162959586e-06, + "loss": 1.0614, + "step": 3695 + }, + { + "epoch": 0.63, + "grad_norm": 35.381394002160434, + "learning_rate": 9.638143261280983e-06, + "loss": 1.0515, + "step": 3700 + }, + { + "epoch": 0.63, + "grad_norm": 52.88442016004509, + "learning_rate": 9.636293821981728e-06, + "loss": 1.0715, + "step": 3705 + }, + { + "epoch": 0.63, + "grad_norm": 60.17940969814508, + "learning_rate": 9.634439846871173e-06, + "loss": 1.0605, + "step": 3710 + }, + { + "epoch": 0.63, + "grad_norm": 42.01453778390265, + "learning_rate": 9.632581337763115e-06, + "loss": 1.0632, + "step": 3715 + }, + { + "epoch": 0.63, + "grad_norm": 16.91412969693914, + "learning_rate": 9.630718296475782e-06, + "loss": 1.0546, + "step": 3720 + }, + { + "epoch": 0.63, + "grad_norm": 9.122096671164469, + "learning_rate": 9.628850724831838e-06, + "loss": 1.0404, + "step": 3725 + }, + { + "epoch": 0.63, + "grad_norm": 10.360961065245824, + "learning_rate": 9.626978624658378e-06, + "loss": 1.0499, + "step": 3730 + }, + { + "epoch": 0.63, + "grad_norm": 20.681549559872614, + "learning_rate": 9.625101997786929e-06, + "loss": 1.0515, + "step": 3735 + }, + { + "epoch": 0.64, + "grad_norm": 34.311939706830195, + "learning_rate": 9.623220846053448e-06, + "loss": 1.0422, + "step": 3740 + }, + { + "epoch": 0.64, + "grad_norm": 9.754605131977751, + "learning_rate": 9.621335171298312e-06, + "loss": 1.0469, + "step": 3745 + }, + { + "epoch": 0.64, + "grad_norm": 17.552801959273697, + "learning_rate": 9.619444975366328e-06, + "loss": 1.0359, + "step": 3750 + }, + { + "epoch": 0.64, + "grad_norm": 16.2524305965504, + "learning_rate": 9.617550260106729e-06, + "loss": 1.0403, + "step": 3755 + }, + { + "epoch": 0.64, + "grad_norm": 11.093338843745048, + "learning_rate": 9.615651027373163e-06, + "loss": 1.0454, + "step": 3760 + }, + { + "epoch": 0.64, + "grad_norm": 10.439354245341704, + "learning_rate": 9.613747279023704e-06, + "loss": 1.0597, + "step": 3765 + }, + { + "epoch": 0.64, + "grad_norm": 34.420764529314724, + "learning_rate": 9.611839016920837e-06, + "loss": 1.034, + "step": 3770 + }, + { + "epoch": 0.64, + "grad_norm": 9.15542917484291, + "learning_rate": 9.609926242931467e-06, + "loss": 1.0317, + "step": 3775 + }, + { + "epoch": 0.64, + "grad_norm": 20.92806883061577, + "learning_rate": 9.608008958926911e-06, + "loss": 1.0545, + "step": 3780 + }, + { + "epoch": 0.64, + "grad_norm": 20.937194797150127, + "learning_rate": 9.606087166782904e-06, + "loss": 1.0473, + "step": 3785 + }, + { + "epoch": 0.64, + "grad_norm": 9.847890544358966, + "learning_rate": 9.604160868379584e-06, + "loss": 1.0362, + "step": 3790 + }, + { + "epoch": 0.65, + "grad_norm": 61.39464963918651, + "learning_rate": 9.6022300656015e-06, + "loss": 1.0238, + "step": 3795 + }, + { + "epoch": 0.65, + "grad_norm": 101.58609678822324, + "learning_rate": 9.600294760337611e-06, + "loss": 1.0628, + "step": 3800 + }, + { + "epoch": 0.65, + "grad_norm": 48.990707631127584, + "learning_rate": 9.598354954481276e-06, + "loss": 1.0538, + "step": 3805 + }, + { + "epoch": 0.65, + "grad_norm": 17.648537571948292, + "learning_rate": 9.596410649930262e-06, + "loss": 1.046, + "step": 3810 + }, + { + "epoch": 0.65, + "grad_norm": 21.385685025543232, + "learning_rate": 9.59446184858673e-06, + "loss": 1.0317, + "step": 3815 + }, + { + "epoch": 0.65, + "grad_norm": 27.822628216288326, + "learning_rate": 9.592508552357251e-06, + "loss": 1.0599, + "step": 3820 + }, + { + "epoch": 0.65, + "grad_norm": 43.65608973671009, + "learning_rate": 9.590550763152781e-06, + "loss": 1.0216, + "step": 3825 + }, + { + "epoch": 0.65, + "grad_norm": 68.66082796487945, + "learning_rate": 9.588588482888684e-06, + "loss": 1.0369, + "step": 3830 + }, + { + "epoch": 0.65, + "grad_norm": 44.87563845739073, + "learning_rate": 9.586621713484708e-06, + "loss": 1.0447, + "step": 3835 + }, + { + "epoch": 0.65, + "grad_norm": 85.29718189990733, + "learning_rate": 9.584650456864997e-06, + "loss": 1.0345, + "step": 3840 + }, + { + "epoch": 0.65, + "grad_norm": 41.63515269827128, + "learning_rate": 9.582674714958088e-06, + "loss": 1.0539, + "step": 3845 + }, + { + "epoch": 0.65, + "grad_norm": 44.2242489976644, + "learning_rate": 9.580694489696896e-06, + "loss": 1.0205, + "step": 3850 + }, + { + "epoch": 0.66, + "grad_norm": 36.041665972339715, + "learning_rate": 9.578709783018734e-06, + "loss": 1.0488, + "step": 3855 + }, + { + "epoch": 0.66, + "grad_norm": 67.57874599444452, + "learning_rate": 9.576720596865292e-06, + "loss": 1.0363, + "step": 3860 + }, + { + "epoch": 0.66, + "grad_norm": 38.21695327694072, + "learning_rate": 9.574726933182645e-06, + "loss": 1.0391, + "step": 3865 + }, + { + "epoch": 0.66, + "grad_norm": 13.661349716058288, + "learning_rate": 9.572728793921248e-06, + "loss": 1.0213, + "step": 3870 + }, + { + "epoch": 0.66, + "grad_norm": 12.13353362747518, + "learning_rate": 9.570726181035934e-06, + "loss": 1.0435, + "step": 3875 + }, + { + "epoch": 0.66, + "grad_norm": 11.334770151580553, + "learning_rate": 9.568719096485915e-06, + "loss": 1.0535, + "step": 3880 + }, + { + "epoch": 0.66, + "grad_norm": 13.117262631880102, + "learning_rate": 9.566707542234774e-06, + "loss": 1.0373, + "step": 3885 + }, + { + "epoch": 0.66, + "grad_norm": 10.325426340360542, + "learning_rate": 9.56469152025047e-06, + "loss": 1.02, + "step": 3890 + }, + { + "epoch": 0.66, + "grad_norm": 27.760103500384453, + "learning_rate": 9.562671032505328e-06, + "loss": 1.0613, + "step": 3895 + }, + { + "epoch": 0.66, + "grad_norm": 19.677606688797173, + "learning_rate": 9.560646080976052e-06, + "loss": 1.0374, + "step": 3900 + }, + { + "epoch": 0.66, + "grad_norm": 12.221222305008846, + "learning_rate": 9.558616667643703e-06, + "loss": 1.0383, + "step": 3905 + }, + { + "epoch": 0.66, + "grad_norm": 11.05047992856828, + "learning_rate": 9.55658279449371e-06, + "loss": 1.0332, + "step": 3910 + }, + { + "epoch": 0.67, + "grad_norm": 8.783299101241285, + "learning_rate": 9.554544463515867e-06, + "loss": 1.0334, + "step": 3915 + }, + { + "epoch": 0.67, + "grad_norm": 14.469272814234113, + "learning_rate": 9.552501676704328e-06, + "loss": 1.0243, + "step": 3920 + }, + { + "epoch": 0.67, + "grad_norm": 11.437389694030383, + "learning_rate": 9.55045443605761e-06, + "loss": 1.042, + "step": 3925 + }, + { + "epoch": 0.67, + "grad_norm": 15.818804679626847, + "learning_rate": 9.548402743578578e-06, + "loss": 1.0106, + "step": 3930 + }, + { + "epoch": 0.67, + "grad_norm": 72.12551959106173, + "learning_rate": 9.54634660127446e-06, + "loss": 1.0335, + "step": 3935 + }, + { + "epoch": 0.67, + "grad_norm": 60.35525493588452, + "learning_rate": 9.544286011156838e-06, + "loss": 1.0206, + "step": 3940 + }, + { + "epoch": 0.67, + "grad_norm": 28.517071830491236, + "learning_rate": 9.542220975241641e-06, + "loss": 1.027, + "step": 3945 + }, + { + "epoch": 0.67, + "grad_norm": 14.41421474517318, + "learning_rate": 9.540151495549148e-06, + "loss": 1.0294, + "step": 3950 + }, + { + "epoch": 0.67, + "grad_norm": 10.95708153303163, + "learning_rate": 9.538077574103988e-06, + "loss": 1.0358, + "step": 3955 + }, + { + "epoch": 0.67, + "grad_norm": 21.632102740651966, + "learning_rate": 9.535999212935135e-06, + "loss": 1.0211, + "step": 3960 + }, + { + "epoch": 0.67, + "grad_norm": 13.86693762831278, + "learning_rate": 9.533916414075906e-06, + "loss": 1.0252, + "step": 3965 + }, + { + "epoch": 0.67, + "grad_norm": 21.262197447013794, + "learning_rate": 9.531829179563958e-06, + "loss": 1.039, + "step": 3970 + }, + { + "epoch": 0.68, + "grad_norm": 34.54442719901615, + "learning_rate": 9.529737511441288e-06, + "loss": 1.0335, + "step": 3975 + }, + { + "epoch": 0.68, + "grad_norm": 63.2216943225613, + "learning_rate": 9.527641411754234e-06, + "loss": 1.0274, + "step": 3980 + }, + { + "epoch": 0.68, + "grad_norm": 48.38593781451802, + "learning_rate": 9.525540882553465e-06, + "loss": 1.0166, + "step": 3985 + }, + { + "epoch": 0.68, + "grad_norm": 45.643379369645665, + "learning_rate": 9.523435925893986e-06, + "loss": 1.0045, + "step": 3990 + }, + { + "epoch": 0.68, + "grad_norm": 24.825075531481474, + "learning_rate": 9.521326543835135e-06, + "loss": 1.0216, + "step": 3995 + }, + { + "epoch": 0.68, + "grad_norm": 32.30923946943804, + "learning_rate": 9.519212738440572e-06, + "loss": 1.0252, + "step": 4000 + }, + { + "epoch": 0.68, + "grad_norm": 31.268891186427773, + "learning_rate": 9.517094511778294e-06, + "loss": 0.9941, + "step": 4005 + }, + { + "epoch": 0.68, + "grad_norm": 19.506914693015624, + "learning_rate": 9.514971865920618e-06, + "loss": 1.0087, + "step": 4010 + }, + { + "epoch": 0.68, + "grad_norm": 16.515776436928725, + "learning_rate": 9.512844802944186e-06, + "loss": 1.0195, + "step": 4015 + }, + { + "epoch": 0.68, + "grad_norm": 15.148277998784094, + "learning_rate": 9.510713324929962e-06, + "loss": 1.0374, + "step": 4020 + }, + { + "epoch": 0.68, + "grad_norm": 12.235328265558776, + "learning_rate": 9.508577433963227e-06, + "loss": 1.0438, + "step": 4025 + }, + { + "epoch": 0.69, + "grad_norm": 17.09979546945935, + "learning_rate": 9.506437132133581e-06, + "loss": 1.0227, + "step": 4030 + }, + { + "epoch": 0.69, + "grad_norm": 8.238589281663184, + "learning_rate": 9.50429242153494e-06, + "loss": 1.0176, + "step": 4035 + }, + { + "epoch": 0.69, + "grad_norm": 10.311313851556408, + "learning_rate": 9.50214330426553e-06, + "loss": 1.0284, + "step": 4040 + }, + { + "epoch": 0.69, + "grad_norm": 12.345687173293602, + "learning_rate": 9.499989782427893e-06, + "loss": 1.0142, + "step": 4045 + }, + { + "epoch": 0.69, + "grad_norm": 10.004499727601269, + "learning_rate": 9.497831858128876e-06, + "loss": 1.0229, + "step": 4050 + }, + { + "epoch": 0.69, + "grad_norm": 14.725097828165937, + "learning_rate": 9.495669533479634e-06, + "loss": 1.0155, + "step": 4055 + }, + { + "epoch": 0.69, + "grad_norm": 14.228255051625794, + "learning_rate": 9.49350281059563e-06, + "loss": 1.0027, + "step": 4060 + }, + { + "epoch": 0.69, + "grad_norm": 15.82098769307729, + "learning_rate": 9.491331691596625e-06, + "loss": 1.0003, + "step": 4065 + }, + { + "epoch": 0.69, + "grad_norm": 20.029534817911642, + "learning_rate": 9.489156178606684e-06, + "loss": 1.0191, + "step": 4070 + }, + { + "epoch": 0.69, + "grad_norm": 14.096382014592697, + "learning_rate": 9.48697627375417e-06, + "loss": 1.0358, + "step": 4075 + }, + { + "epoch": 0.69, + "grad_norm": 11.164168759887309, + "learning_rate": 9.484791979171744e-06, + "loss": 1.0141, + "step": 4080 + }, + { + "epoch": 0.69, + "grad_norm": 13.03131118489538, + "learning_rate": 9.482603296996358e-06, + "loss": 1.0161, + "step": 4085 + }, + { + "epoch": 0.7, + "grad_norm": 20.75518249609745, + "learning_rate": 9.48041022936926e-06, + "loss": 1.0082, + "step": 4090 + }, + { + "epoch": 0.7, + "grad_norm": 18.349151443252808, + "learning_rate": 9.478212778435987e-06, + "loss": 1.0038, + "step": 4095 + }, + { + "epoch": 0.7, + "grad_norm": 11.74009872478776, + "learning_rate": 9.476010946346365e-06, + "loss": 1.0176, + "step": 4100 + }, + { + "epoch": 0.7, + "grad_norm": 13.412538009143013, + "learning_rate": 9.473804735254507e-06, + "loss": 1.0209, + "step": 4105 + }, + { + "epoch": 0.7, + "grad_norm": 55.590697499654354, + "learning_rate": 9.471594147318806e-06, + "loss": 1.0235, + "step": 4110 + }, + { + "epoch": 0.7, + "grad_norm": 20.395513398092206, + "learning_rate": 9.469379184701942e-06, + "loss": 1.0252, + "step": 4115 + }, + { + "epoch": 0.7, + "grad_norm": 82.57676781630296, + "learning_rate": 9.46715984957087e-06, + "loss": 1.0101, + "step": 4120 + }, + { + "epoch": 0.7, + "grad_norm": 109.30761353748431, + "learning_rate": 9.464936144096828e-06, + "loss": 1.0, + "step": 4125 + }, + { + "epoch": 0.7, + "grad_norm": 53.283011866055276, + "learning_rate": 9.462708070455327e-06, + "loss": 1.0277, + "step": 4130 + }, + { + "epoch": 0.7, + "grad_norm": 53.341823780234854, + "learning_rate": 9.46047563082615e-06, + "loss": 1.0237, + "step": 4135 + }, + { + "epoch": 0.7, + "grad_norm": 18.15169787672353, + "learning_rate": 9.458238827393353e-06, + "loss": 1.0182, + "step": 4140 + }, + { + "epoch": 0.7, + "grad_norm": 9.281494842923637, + "learning_rate": 9.455997662345262e-06, + "loss": 0.9904, + "step": 4145 + }, + { + "epoch": 0.71, + "grad_norm": 12.863337348739847, + "learning_rate": 9.45375213787447e-06, + "loss": 1.0252, + "step": 4150 + }, + { + "epoch": 0.71, + "grad_norm": 19.464852452342406, + "learning_rate": 9.451502256177832e-06, + "loss": 1.008, + "step": 4155 + }, + { + "epoch": 0.71, + "grad_norm": 23.836143947796305, + "learning_rate": 9.44924801945647e-06, + "loss": 1.0041, + "step": 4160 + }, + { + "epoch": 0.71, + "grad_norm": 21.083807605337846, + "learning_rate": 9.446989429915763e-06, + "loss": 1.0062, + "step": 4165 + }, + { + "epoch": 0.71, + "grad_norm": 20.37749053282835, + "learning_rate": 9.44472648976535e-06, + "loss": 1.0121, + "step": 4170 + }, + { + "epoch": 0.71, + "grad_norm": 15.873447859539434, + "learning_rate": 9.442459201219127e-06, + "loss": 1.0051, + "step": 4175 + }, + { + "epoch": 0.71, + "grad_norm": 18.959460649769564, + "learning_rate": 9.440187566495246e-06, + "loss": 1.004, + "step": 4180 + }, + { + "epoch": 0.71, + "grad_norm": 27.467828324088764, + "learning_rate": 9.437911587816105e-06, + "loss": 0.9968, + "step": 4185 + }, + { + "epoch": 0.71, + "grad_norm": 43.20261939532219, + "learning_rate": 9.435631267408355e-06, + "loss": 1.0115, + "step": 4190 + }, + { + "epoch": 0.71, + "grad_norm": 26.800565504607206, + "learning_rate": 9.4333466075029e-06, + "loss": 1.0122, + "step": 4195 + }, + { + "epoch": 0.71, + "grad_norm": 9.94300587635886, + "learning_rate": 9.431057610334878e-06, + "loss": 1.0043, + "step": 4200 + }, + { + "epoch": 0.71, + "grad_norm": 10.614855349141937, + "learning_rate": 9.42876427814368e-06, + "loss": 1.0002, + "step": 4205 + }, + { + "epoch": 0.72, + "grad_norm": 11.144998257948764, + "learning_rate": 9.426466613172935e-06, + "loss": 0.9941, + "step": 4210 + }, + { + "epoch": 0.72, + "grad_norm": 9.103838790198429, + "learning_rate": 9.42416461767051e-06, + "loss": 0.9967, + "step": 4215 + }, + { + "epoch": 0.72, + "grad_norm": 11.103071437327314, + "learning_rate": 9.421858293888509e-06, + "loss": 0.9872, + "step": 4220 + }, + { + "epoch": 0.72, + "grad_norm": 17.4543054300558, + "learning_rate": 9.41954764408327e-06, + "loss": 1.0034, + "step": 4225 + }, + { + "epoch": 0.72, + "grad_norm": 29.419165904681627, + "learning_rate": 9.417232670515367e-06, + "loss": 0.9876, + "step": 4230 + }, + { + "epoch": 0.72, + "grad_norm": 24.500606114143153, + "learning_rate": 9.414913375449598e-06, + "loss": 1.004, + "step": 4235 + }, + { + "epoch": 0.72, + "grad_norm": 20.066193807337648, + "learning_rate": 9.41258976115499e-06, + "loss": 1.0127, + "step": 4240 + }, + { + "epoch": 0.72, + "grad_norm": 52.031867154118224, + "learning_rate": 9.410261829904805e-06, + "loss": 1.0046, + "step": 4245 + }, + { + "epoch": 0.72, + "grad_norm": 21.157296869356358, + "learning_rate": 9.407929583976514e-06, + "loss": 1.0206, + "step": 4250 + }, + { + "epoch": 0.72, + "grad_norm": 7.959852530800298, + "learning_rate": 9.40559302565182e-06, + "loss": 0.9852, + "step": 4255 + }, + { + "epoch": 0.72, + "grad_norm": 18.55092856854972, + "learning_rate": 9.40325215721664e-06, + "loss": 1.0152, + "step": 4260 + }, + { + "epoch": 0.73, + "grad_norm": 10.525060393654432, + "learning_rate": 9.400906980961106e-06, + "loss": 1.004, + "step": 4265 + }, + { + "epoch": 0.73, + "grad_norm": 9.212100273291716, + "learning_rate": 9.398557499179573e-06, + "loss": 0.9973, + "step": 4270 + }, + { + "epoch": 0.73, + "grad_norm": 22.460965399961488, + "learning_rate": 9.396203714170595e-06, + "loss": 0.996, + "step": 4275 + }, + { + "epoch": 0.73, + "grad_norm": 14.39405900296308, + "learning_rate": 9.393845628236949e-06, + "loss": 1.007, + "step": 4280 + }, + { + "epoch": 0.73, + "grad_norm": 51.01601071441724, + "learning_rate": 9.391483243685612e-06, + "loss": 0.9842, + "step": 4285 + }, + { + "epoch": 0.73, + "grad_norm": 56.88915495340164, + "learning_rate": 9.38911656282777e-06, + "loss": 1.0078, + "step": 4290 + }, + { + "epoch": 0.73, + "grad_norm": 65.33444023898811, + "learning_rate": 9.386745587978809e-06, + "loss": 0.9993, + "step": 4295 + }, + { + "epoch": 0.73, + "grad_norm": 35.49556367595794, + "learning_rate": 9.384370321458318e-06, + "loss": 0.986, + "step": 4300 + }, + { + "epoch": 0.73, + "grad_norm": 33.949914291057055, + "learning_rate": 9.381990765590086e-06, + "loss": 0.9923, + "step": 4305 + }, + { + "epoch": 0.73, + "grad_norm": 29.173102258533216, + "learning_rate": 9.379606922702092e-06, + "loss": 0.9875, + "step": 4310 + }, + { + "epoch": 0.73, + "grad_norm": 8.288434941882375, + "learning_rate": 9.377218795126519e-06, + "loss": 1.0079, + "step": 4315 + }, + { + "epoch": 0.73, + "grad_norm": 19.693191672912995, + "learning_rate": 9.374826385199735e-06, + "loss": 0.9909, + "step": 4320 + }, + { + "epoch": 0.74, + "grad_norm": 15.090041449361541, + "learning_rate": 9.372429695262297e-06, + "loss": 0.9855, + "step": 4325 + }, + { + "epoch": 0.74, + "grad_norm": 10.317039626693504, + "learning_rate": 9.370028727658956e-06, + "loss": 0.9726, + "step": 4330 + }, + { + "epoch": 0.74, + "grad_norm": 8.737797784798412, + "learning_rate": 9.367623484738639e-06, + "loss": 0.9939, + "step": 4335 + }, + { + "epoch": 0.74, + "grad_norm": 22.78204210712204, + "learning_rate": 9.365213968854463e-06, + "loss": 0.9837, + "step": 4340 + }, + { + "epoch": 0.74, + "grad_norm": 72.390320020571, + "learning_rate": 9.362800182363718e-06, + "loss": 0.9886, + "step": 4345 + }, + { + "epoch": 0.74, + "grad_norm": 57.5047526181269, + "learning_rate": 9.36038212762788e-06, + "loss": 1.0029, + "step": 4350 + }, + { + "epoch": 0.74, + "grad_norm": 101.10985440634894, + "learning_rate": 9.3579598070126e-06, + "loss": 0.9989, + "step": 4355 + }, + { + "epoch": 0.74, + "grad_norm": 72.40360809441891, + "learning_rate": 9.355533222887693e-06, + "loss": 1.0273, + "step": 4360 + }, + { + "epoch": 0.74, + "grad_norm": 23.804178434830444, + "learning_rate": 9.353102377627155e-06, + "loss": 0.9941, + "step": 4365 + }, + { + "epoch": 0.74, + "grad_norm": 96.09502523304418, + "learning_rate": 9.350667273609148e-06, + "loss": 1.0098, + "step": 4370 + }, + { + "epoch": 0.74, + "grad_norm": 75.77778989570355, + "learning_rate": 9.348227913216e-06, + "loss": 0.9983, + "step": 4375 + }, + { + "epoch": 0.74, + "grad_norm": 18.44568445884761, + "learning_rate": 9.345784298834202e-06, + "loss": 0.9828, + "step": 4380 + }, + { + "epoch": 0.75, + "grad_norm": 41.25631175653349, + "learning_rate": 9.343336432854408e-06, + "loss": 0.9831, + "step": 4385 + }, + { + "epoch": 0.75, + "grad_norm": 35.698092682203594, + "learning_rate": 9.340884317671432e-06, + "loss": 0.998, + "step": 4390 + }, + { + "epoch": 0.75, + "grad_norm": 32.49391162293078, + "learning_rate": 9.338427955684243e-06, + "loss": 0.9806, + "step": 4395 + }, + { + "epoch": 0.75, + "grad_norm": 10.241743309089527, + "learning_rate": 9.335967349295967e-06, + "loss": 0.9885, + "step": 4400 + }, + { + "epoch": 0.75, + "grad_norm": 32.37182387434173, + "learning_rate": 9.333502500913882e-06, + "loss": 0.9873, + "step": 4405 + }, + { + "epoch": 0.75, + "grad_norm": 30.142157717062506, + "learning_rate": 9.331033412949417e-06, + "loss": 1.0035, + "step": 4410 + }, + { + "epoch": 0.75, + "grad_norm": 12.9808930058223, + "learning_rate": 9.328560087818143e-06, + "loss": 0.9826, + "step": 4415 + }, + { + "epoch": 0.75, + "grad_norm": 10.087635510049255, + "learning_rate": 9.326082527939786e-06, + "loss": 0.9674, + "step": 4420 + }, + { + "epoch": 0.75, + "grad_norm": 13.901907099337802, + "learning_rate": 9.323600735738207e-06, + "loss": 0.9783, + "step": 4425 + }, + { + "epoch": 0.75, + "grad_norm": 22.503501555175596, + "learning_rate": 9.321114713641409e-06, + "loss": 0.9824, + "step": 4430 + }, + { + "epoch": 0.75, + "grad_norm": 15.802700822262079, + "learning_rate": 9.318624464081535e-06, + "loss": 0.9759, + "step": 4435 + }, + { + "epoch": 0.75, + "grad_norm": 12.704909570321815, + "learning_rate": 9.316129989494866e-06, + "loss": 0.9643, + "step": 4440 + }, + { + "epoch": 0.76, + "grad_norm": 9.51245114231154, + "learning_rate": 9.313631292321812e-06, + "loss": 0.9793, + "step": 4445 + }, + { + "epoch": 0.76, + "grad_norm": 9.31093610700748, + "learning_rate": 9.311128375006915e-06, + "loss": 1.0008, + "step": 4450 + }, + { + "epoch": 0.76, + "grad_norm": 13.48618925404952, + "learning_rate": 9.308621239998847e-06, + "loss": 0.9894, + "step": 4455 + }, + { + "epoch": 0.76, + "grad_norm": 10.277491750024144, + "learning_rate": 9.306109889750405e-06, + "loss": 0.987, + "step": 4460 + }, + { + "epoch": 0.76, + "grad_norm": 13.651178877768626, + "learning_rate": 9.303594326718514e-06, + "loss": 0.9691, + "step": 4465 + }, + { + "epoch": 0.76, + "grad_norm": 22.96171498854302, + "learning_rate": 9.301074553364214e-06, + "loss": 0.9568, + "step": 4470 + }, + { + "epoch": 0.76, + "grad_norm": 21.281745464081407, + "learning_rate": 9.29855057215267e-06, + "loss": 0.9785, + "step": 4475 + }, + { + "epoch": 0.76, + "grad_norm": 45.786548050160974, + "learning_rate": 9.296022385553156e-06, + "loss": 0.9658, + "step": 4480 + }, + { + "epoch": 0.76, + "grad_norm": 28.449037927202944, + "learning_rate": 9.293489996039068e-06, + "loss": 0.9571, + "step": 4485 + }, + { + "epoch": 0.76, + "grad_norm": 25.5063776566906, + "learning_rate": 9.290953406087913e-06, + "loss": 0.9924, + "step": 4490 + }, + { + "epoch": 0.76, + "grad_norm": 23.902215031162577, + "learning_rate": 9.288412618181305e-06, + "loss": 0.9644, + "step": 4495 + }, + { + "epoch": 0.77, + "grad_norm": 28.321553047312477, + "learning_rate": 9.285867634804961e-06, + "loss": 0.9682, + "step": 4500 + }, + { + "epoch": 0.77, + "grad_norm": 19.414695666211387, + "learning_rate": 9.283318458448711e-06, + "loss": 0.9748, + "step": 4505 + }, + { + "epoch": 0.77, + "grad_norm": 8.265335693282779, + "learning_rate": 9.280765091606481e-06, + "loss": 0.9667, + "step": 4510 + }, + { + "epoch": 0.77, + "grad_norm": 10.589118747517364, + "learning_rate": 9.2782075367763e-06, + "loss": 0.9871, + "step": 4515 + }, + { + "epoch": 0.77, + "grad_norm": 17.766284702178446, + "learning_rate": 9.275645796460292e-06, + "loss": 0.9696, + "step": 4520 + }, + { + "epoch": 0.77, + "grad_norm": 22.62383414229558, + "learning_rate": 9.273079873164676e-06, + "loss": 0.972, + "step": 4525 + }, + { + "epoch": 0.77, + "grad_norm": 22.746944385026122, + "learning_rate": 9.270509769399767e-06, + "loss": 0.9561, + "step": 4530 + }, + { + "epoch": 0.77, + "grad_norm": 13.635254266812355, + "learning_rate": 9.267935487679962e-06, + "loss": 0.9686, + "step": 4535 + }, + { + "epoch": 0.77, + "grad_norm": 26.008001828316356, + "learning_rate": 9.265357030523756e-06, + "loss": 0.9966, + "step": 4540 + }, + { + "epoch": 0.77, + "grad_norm": 19.690837446359836, + "learning_rate": 9.262774400453717e-06, + "loss": 0.9762, + "step": 4545 + }, + { + "epoch": 0.77, + "grad_norm": 9.165326461540976, + "learning_rate": 9.260187599996507e-06, + "loss": 0.9907, + "step": 4550 + }, + { + "epoch": 0.77, + "grad_norm": 16.63560083766881, + "learning_rate": 9.25759663168286e-06, + "loss": 0.9769, + "step": 4555 + }, + { + "epoch": 0.78, + "grad_norm": 29.926072811215636, + "learning_rate": 9.255001498047592e-06, + "loss": 0.9694, + "step": 4560 + }, + { + "epoch": 0.78, + "grad_norm": 43.916438852666026, + "learning_rate": 9.252402201629588e-06, + "loss": 0.9616, + "step": 4565 + }, + { + "epoch": 0.78, + "grad_norm": 36.62552865460111, + "learning_rate": 9.249798744971815e-06, + "loss": 0.9674, + "step": 4570 + }, + { + "epoch": 0.78, + "grad_norm": 44.70586590779445, + "learning_rate": 9.2471911306213e-06, + "loss": 0.9651, + "step": 4575 + }, + { + "epoch": 0.78, + "grad_norm": 24.188160765283822, + "learning_rate": 9.244579361129147e-06, + "loss": 0.9441, + "step": 4580 + }, + { + "epoch": 0.78, + "grad_norm": 16.006816269337822, + "learning_rate": 9.241963439050519e-06, + "loss": 0.9683, + "step": 4585 + }, + { + "epoch": 0.78, + "grad_norm": 10.293519001103515, + "learning_rate": 9.239343366944641e-06, + "loss": 0.967, + "step": 4590 + }, + { + "epoch": 0.78, + "grad_norm": 11.409536051527816, + "learning_rate": 9.236719147374801e-06, + "loss": 0.9567, + "step": 4595 + }, + { + "epoch": 0.78, + "grad_norm": 11.825285059132497, + "learning_rate": 9.234090782908346e-06, + "loss": 0.9572, + "step": 4600 + }, + { + "epoch": 0.78, + "grad_norm": 13.057182699834947, + "learning_rate": 9.231458276116676e-06, + "loss": 0.9747, + "step": 4605 + }, + { + "epoch": 0.78, + "grad_norm": 16.464766579821713, + "learning_rate": 9.22882162957524e-06, + "loss": 0.949, + "step": 4610 + }, + { + "epoch": 0.78, + "grad_norm": 21.69207643557753, + "learning_rate": 9.226180845863544e-06, + "loss": 0.9741, + "step": 4615 + }, + { + "epoch": 0.79, + "grad_norm": 38.55352202928315, + "learning_rate": 9.223535927565135e-06, + "loss": 0.9529, + "step": 4620 + }, + { + "epoch": 0.79, + "grad_norm": 34.96091841468431, + "learning_rate": 9.220886877267609e-06, + "loss": 0.9746, + "step": 4625 + }, + { + "epoch": 0.79, + "grad_norm": 20.01930871822319, + "learning_rate": 9.218233697562604e-06, + "loss": 0.9626, + "step": 4630 + }, + { + "epoch": 0.79, + "grad_norm": 11.324063907979856, + "learning_rate": 9.215576391045797e-06, + "loss": 0.9602, + "step": 4635 + }, + { + "epoch": 0.79, + "grad_norm": 9.150997568199399, + "learning_rate": 9.212914960316902e-06, + "loss": 0.9481, + "step": 4640 + }, + { + "epoch": 0.79, + "grad_norm": 41.68771819761749, + "learning_rate": 9.21024940797967e-06, + "loss": 0.9771, + "step": 4645 + }, + { + "epoch": 0.79, + "grad_norm": 73.61863057444658, + "learning_rate": 9.207579736641881e-06, + "loss": 0.9795, + "step": 4650 + }, + { + "epoch": 0.79, + "grad_norm": 38.792496873295825, + "learning_rate": 9.204905948915345e-06, + "loss": 0.9535, + "step": 4655 + }, + { + "epoch": 0.79, + "grad_norm": 51.91198314996973, + "learning_rate": 9.202228047415905e-06, + "loss": 0.9454, + "step": 4660 + }, + { + "epoch": 0.79, + "grad_norm": 66.68820192554747, + "learning_rate": 9.199546034763423e-06, + "loss": 0.9643, + "step": 4665 + }, + { + "epoch": 0.79, + "grad_norm": 115.50748156350188, + "learning_rate": 9.196859913581781e-06, + "loss": 0.978, + "step": 4670 + }, + { + "epoch": 0.79, + "grad_norm": 50.953571534266246, + "learning_rate": 9.194169686498887e-06, + "loss": 0.9425, + "step": 4675 + }, + { + "epoch": 0.8, + "grad_norm": 49.81249494752531, + "learning_rate": 9.191475356146661e-06, + "loss": 0.9647, + "step": 4680 + }, + { + "epoch": 0.8, + "grad_norm": 9.125148068708931, + "learning_rate": 9.188776925161042e-06, + "loss": 0.9427, + "step": 4685 + }, + { + "epoch": 0.8, + "grad_norm": 35.472294700071664, + "learning_rate": 9.186074396181974e-06, + "loss": 0.9474, + "step": 4690 + }, + { + "epoch": 0.8, + "grad_norm": 31.34096469566438, + "learning_rate": 9.183367771853417e-06, + "loss": 0.9613, + "step": 4695 + }, + { + "epoch": 0.8, + "grad_norm": 29.51902471391319, + "learning_rate": 9.180657054823334e-06, + "loss": 0.9462, + "step": 4700 + }, + { + "epoch": 0.8, + "grad_norm": 15.096319871398451, + "learning_rate": 9.17794224774369e-06, + "loss": 0.956, + "step": 4705 + }, + { + "epoch": 0.8, + "grad_norm": 9.97502452642927, + "learning_rate": 9.175223353270457e-06, + "loss": 0.9323, + "step": 4710 + }, + { + "epoch": 0.8, + "grad_norm": 11.373732134631773, + "learning_rate": 9.172500374063603e-06, + "loss": 0.9411, + "step": 4715 + }, + { + "epoch": 0.8, + "grad_norm": 8.572979968003773, + "learning_rate": 9.169773312787086e-06, + "loss": 0.9431, + "step": 4720 + }, + { + "epoch": 0.8, + "grad_norm": 17.165158198545925, + "learning_rate": 9.167042172108874e-06, + "loss": 0.9598, + "step": 4725 + }, + { + "epoch": 0.8, + "grad_norm": 9.683924164964983, + "learning_rate": 9.164306954700905e-06, + "loss": 0.9486, + "step": 4730 + }, + { + "epoch": 0.8, + "grad_norm": 15.224964756040686, + "learning_rate": 9.161567663239126e-06, + "loss": 0.9391, + "step": 4735 + }, + { + "epoch": 0.81, + "grad_norm": 10.376098770544099, + "learning_rate": 9.15882430040345e-06, + "loss": 0.963, + "step": 4740 + }, + { + "epoch": 0.81, + "grad_norm": 10.908609034274736, + "learning_rate": 9.15607686887779e-06, + "loss": 0.9311, + "step": 4745 + }, + { + "epoch": 0.81, + "grad_norm": 12.63840384715431, + "learning_rate": 9.153325371350028e-06, + "loss": 0.9697, + "step": 4750 + }, + { + "epoch": 0.81, + "grad_norm": 22.946614579574426, + "learning_rate": 9.150569810512033e-06, + "loss": 0.96, + "step": 4755 + }, + { + "epoch": 0.81, + "grad_norm": 13.167063738591903, + "learning_rate": 9.147810189059639e-06, + "loss": 0.9479, + "step": 4760 + }, + { + "epoch": 0.81, + "grad_norm": 37.92914695340067, + "learning_rate": 9.145046509692661e-06, + "loss": 0.9423, + "step": 4765 + }, + { + "epoch": 0.81, + "grad_norm": 24.365889248555195, + "learning_rate": 9.142278775114882e-06, + "loss": 0.9418, + "step": 4770 + }, + { + "epoch": 0.81, + "grad_norm": 67.68436185583097, + "learning_rate": 9.139506988034049e-06, + "loss": 0.9537, + "step": 4775 + }, + { + "epoch": 0.81, + "grad_norm": 39.491890864272584, + "learning_rate": 9.136731151161877e-06, + "loss": 0.9591, + "step": 4780 + }, + { + "epoch": 0.81, + "grad_norm": 45.38530370615395, + "learning_rate": 9.133951267214043e-06, + "loss": 0.9355, + "step": 4785 + }, + { + "epoch": 0.81, + "grad_norm": 14.378791634160802, + "learning_rate": 9.13116733891018e-06, + "loss": 0.9579, + "step": 4790 + }, + { + "epoch": 0.82, + "grad_norm": 45.370670351833056, + "learning_rate": 9.128379368973884e-06, + "loss": 0.9408, + "step": 4795 + }, + { + "epoch": 0.82, + "grad_norm": 22.49270893151392, + "learning_rate": 9.125587360132697e-06, + "loss": 0.9281, + "step": 4800 + }, + { + "epoch": 0.82, + "grad_norm": 43.64429040063124, + "learning_rate": 9.12279131511812e-06, + "loss": 0.9346, + "step": 4805 + }, + { + "epoch": 0.82, + "grad_norm": 9.215164955234982, + "learning_rate": 9.1199912366656e-06, + "loss": 0.9424, + "step": 4810 + }, + { + "epoch": 0.82, + "grad_norm": 11.742762494067678, + "learning_rate": 9.117187127514524e-06, + "loss": 0.9456, + "step": 4815 + }, + { + "epoch": 0.82, + "grad_norm": 7.727143118510911, + "learning_rate": 9.11437899040823e-06, + "loss": 0.9465, + "step": 4820 + }, + { + "epoch": 0.82, + "grad_norm": 10.832153090627127, + "learning_rate": 9.111566828093998e-06, + "loss": 0.9178, + "step": 4825 + }, + { + "epoch": 0.82, + "grad_norm": 9.630265380456894, + "learning_rate": 9.108750643323036e-06, + "loss": 0.9504, + "step": 4830 + }, + { + "epoch": 0.82, + "grad_norm": 19.444717713770846, + "learning_rate": 9.1059304388505e-06, + "loss": 0.9245, + "step": 4835 + }, + { + "epoch": 0.82, + "grad_norm": 24.329705280184676, + "learning_rate": 9.103106217435467e-06, + "loss": 0.9341, + "step": 4840 + }, + { + "epoch": 0.82, + "grad_norm": 27.83847152420756, + "learning_rate": 9.100277981840953e-06, + "loss": 0.9589, + "step": 4845 + }, + { + "epoch": 0.82, + "grad_norm": 11.50842107404206, + "learning_rate": 9.097445734833893e-06, + "loss": 0.9566, + "step": 4850 + }, + { + "epoch": 0.83, + "grad_norm": 43.06023501265569, + "learning_rate": 9.094609479185153e-06, + "loss": 0.9532, + "step": 4855 + }, + { + "epoch": 0.83, + "grad_norm": 50.720088892736904, + "learning_rate": 9.091769217669517e-06, + "loss": 0.9569, + "step": 4860 + }, + { + "epoch": 0.83, + "grad_norm": 62.24897567865879, + "learning_rate": 9.088924953065691e-06, + "loss": 0.9266, + "step": 4865 + }, + { + "epoch": 0.83, + "grad_norm": 57.80399144018989, + "learning_rate": 9.086076688156297e-06, + "loss": 0.9427, + "step": 4870 + }, + { + "epoch": 0.83, + "grad_norm": 8.46459487344967, + "learning_rate": 9.083224425727867e-06, + "loss": 0.9351, + "step": 4875 + }, + { + "epoch": 0.83, + "grad_norm": 16.698678860667545, + "learning_rate": 9.080368168570845e-06, + "loss": 0.9469, + "step": 4880 + }, + { + "epoch": 0.83, + "grad_norm": 9.687820627493341, + "learning_rate": 9.077507919479589e-06, + "loss": 0.9212, + "step": 4885 + }, + { + "epoch": 0.83, + "grad_norm": 45.913078252317284, + "learning_rate": 9.074643681252356e-06, + "loss": 0.9336, + "step": 4890 + }, + { + "epoch": 0.83, + "grad_norm": 32.80796660229274, + "learning_rate": 9.071775456691303e-06, + "loss": 0.9298, + "step": 4895 + }, + { + "epoch": 0.83, + "grad_norm": 10.983476920809734, + "learning_rate": 9.068903248602497e-06, + "loss": 0.9457, + "step": 4900 + }, + { + "epoch": 0.83, + "grad_norm": 10.211639785267453, + "learning_rate": 9.066027059795896e-06, + "loss": 0.9482, + "step": 4905 + }, + { + "epoch": 0.83, + "grad_norm": 10.51243616241861, + "learning_rate": 9.06314689308535e-06, + "loss": 0.934, + "step": 4910 + }, + { + "epoch": 0.84, + "grad_norm": 18.320268531763826, + "learning_rate": 9.060262751288607e-06, + "loss": 0.9434, + "step": 4915 + }, + { + "epoch": 0.84, + "grad_norm": 11.275224107377596, + "learning_rate": 9.057374637227299e-06, + "loss": 0.9162, + "step": 4920 + }, + { + "epoch": 0.84, + "grad_norm": 9.384577769247377, + "learning_rate": 9.054482553726946e-06, + "loss": 0.9366, + "step": 4925 + }, + { + "epoch": 0.84, + "grad_norm": 7.895425873893831, + "learning_rate": 9.051586503616952e-06, + "loss": 0.9241, + "step": 4930 + }, + { + "epoch": 0.84, + "grad_norm": 14.604982979399926, + "learning_rate": 9.0486864897306e-06, + "loss": 0.9256, + "step": 4935 + }, + { + "epoch": 0.84, + "grad_norm": 22.10558216735927, + "learning_rate": 9.045782514905052e-06, + "loss": 0.9175, + "step": 4940 + }, + { + "epoch": 0.84, + "grad_norm": 21.66537436134228, + "learning_rate": 9.042874581981347e-06, + "loss": 0.9241, + "step": 4945 + }, + { + "epoch": 0.84, + "grad_norm": 28.46117267926381, + "learning_rate": 9.03996269380439e-06, + "loss": 0.9401, + "step": 4950 + }, + { + "epoch": 0.84, + "grad_norm": 42.00707684144836, + "learning_rate": 9.037046853222963e-06, + "loss": 0.932, + "step": 4955 + }, + { + "epoch": 0.84, + "grad_norm": 64.63923023317382, + "learning_rate": 9.034127063089712e-06, + "loss": 0.9438, + "step": 4960 + }, + { + "epoch": 0.84, + "grad_norm": 78.80763586939075, + "learning_rate": 9.031203326261144e-06, + "loss": 0.9452, + "step": 4965 + }, + { + "epoch": 0.84, + "grad_norm": 75.57620927164031, + "learning_rate": 9.028275645597631e-06, + "loss": 0.9298, + "step": 4970 + }, + { + "epoch": 0.85, + "grad_norm": 8.658113859599196, + "learning_rate": 9.0253440239634e-06, + "loss": 0.923, + "step": 4975 + }, + { + "epoch": 0.85, + "grad_norm": 48.495850283320955, + "learning_rate": 9.022408464226541e-06, + "loss": 0.9365, + "step": 4980 + }, + { + "epoch": 0.85, + "grad_norm": 37.806366636999435, + "learning_rate": 9.019468969258985e-06, + "loss": 0.9413, + "step": 4985 + }, + { + "epoch": 0.85, + "grad_norm": 57.21603624921526, + "learning_rate": 9.01652554193652e-06, + "loss": 0.9333, + "step": 4990 + }, + { + "epoch": 0.85, + "grad_norm": 40.6670860223311, + "learning_rate": 9.013578185138784e-06, + "loss": 0.9312, + "step": 4995 + }, + { + "epoch": 0.85, + "grad_norm": 14.668175890753217, + "learning_rate": 9.010626901749254e-06, + "loss": 0.9295, + "step": 5000 + }, + { + "epoch": 0.85, + "grad_norm": 40.73927855960309, + "learning_rate": 9.00767169465525e-06, + "loss": 0.9169, + "step": 5005 + }, + { + "epoch": 0.85, + "grad_norm": 12.109205560365105, + "learning_rate": 9.004712566747929e-06, + "loss": 0.916, + "step": 5010 + }, + { + "epoch": 0.85, + "grad_norm": 7.781500255230441, + "learning_rate": 9.001749520922289e-06, + "loss": 0.9198, + "step": 5015 + }, + { + "epoch": 0.85, + "grad_norm": 25.48428047305956, + "learning_rate": 8.998782560077155e-06, + "loss": 0.9068, + "step": 5020 + }, + { + "epoch": 0.85, + "grad_norm": 32.98125695456632, + "learning_rate": 8.995811687115186e-06, + "loss": 0.932, + "step": 5025 + }, + { + "epoch": 0.86, + "grad_norm": 14.97283215980044, + "learning_rate": 8.992836904942865e-06, + "loss": 0.9379, + "step": 5030 + }, + { + "epoch": 0.86, + "grad_norm": 9.253049260277942, + "learning_rate": 8.989858216470507e-06, + "loss": 0.918, + "step": 5035 + }, + { + "epoch": 0.86, + "grad_norm": 12.544214252920154, + "learning_rate": 8.986875624612236e-06, + "loss": 0.913, + "step": 5040 + }, + { + "epoch": 0.86, + "grad_norm": 22.04564956071582, + "learning_rate": 8.98388913228601e-06, + "loss": 0.9178, + "step": 5045 + }, + { + "epoch": 0.86, + "grad_norm": 17.683536399577275, + "learning_rate": 8.980898742413587e-06, + "loss": 0.9304, + "step": 5050 + }, + { + "epoch": 0.86, + "grad_norm": 14.161467334972684, + "learning_rate": 8.977904457920552e-06, + "loss": 0.9145, + "step": 5055 + }, + { + "epoch": 0.86, + "grad_norm": 8.577177096548642, + "learning_rate": 8.974906281736291e-06, + "loss": 0.914, + "step": 5060 + }, + { + "epoch": 0.86, + "grad_norm": 13.327368511504124, + "learning_rate": 8.971904216794002e-06, + "loss": 0.9043, + "step": 5065 + }, + { + "epoch": 0.86, + "grad_norm": 21.406488269474284, + "learning_rate": 8.968898266030688e-06, + "loss": 0.9202, + "step": 5070 + }, + { + "epoch": 0.86, + "grad_norm": 31.269621520553603, + "learning_rate": 8.965888432387147e-06, + "loss": 0.9151, + "step": 5075 + }, + { + "epoch": 0.86, + "grad_norm": 18.802503941578262, + "learning_rate": 8.962874718807984e-06, + "loss": 0.9097, + "step": 5080 + }, + { + "epoch": 0.86, + "grad_norm": 18.848981198337093, + "learning_rate": 8.959857128241596e-06, + "loss": 0.9025, + "step": 5085 + }, + { + "epoch": 0.87, + "grad_norm": 12.072241314901154, + "learning_rate": 8.956835663640173e-06, + "loss": 0.9138, + "step": 5090 + }, + { + "epoch": 0.87, + "grad_norm": 9.332658464526737, + "learning_rate": 8.953810327959693e-06, + "loss": 0.9048, + "step": 5095 + }, + { + "epoch": 0.87, + "grad_norm": 21.132008030484357, + "learning_rate": 8.950781124159926e-06, + "loss": 0.9178, + "step": 5100 + }, + { + "epoch": 0.87, + "grad_norm": 39.50889372852151, + "learning_rate": 8.947748055204424e-06, + "loss": 0.9219, + "step": 5105 + }, + { + "epoch": 0.87, + "grad_norm": 21.346258805968034, + "learning_rate": 8.944711124060519e-06, + "loss": 0.9192, + "step": 5110 + }, + { + "epoch": 0.87, + "grad_norm": 8.112949405602714, + "learning_rate": 8.941670333699323e-06, + "loss": 0.9018, + "step": 5115 + }, + { + "epoch": 0.87, + "grad_norm": 35.026352865269004, + "learning_rate": 8.938625687095723e-06, + "loss": 0.9051, + "step": 5120 + }, + { + "epoch": 0.87, + "grad_norm": 21.948641624705495, + "learning_rate": 8.93557718722838e-06, + "loss": 0.9302, + "step": 5125 + }, + { + "epoch": 0.87, + "grad_norm": 13.460413331641375, + "learning_rate": 8.932524837079721e-06, + "loss": 0.9094, + "step": 5130 + }, + { + "epoch": 0.87, + "grad_norm": 33.16554501484199, + "learning_rate": 8.929468639635946e-06, + "loss": 0.9112, + "step": 5135 + }, + { + "epoch": 0.87, + "grad_norm": 55.507150253651496, + "learning_rate": 8.926408597887013e-06, + "loss": 0.9076, + "step": 5140 + }, + { + "epoch": 0.87, + "grad_norm": 25.672990324546266, + "learning_rate": 8.923344714826646e-06, + "loss": 0.9157, + "step": 5145 + }, + { + "epoch": 0.88, + "grad_norm": 12.314062818429154, + "learning_rate": 8.920276993452319e-06, + "loss": 0.9138, + "step": 5150 + }, + { + "epoch": 0.88, + "grad_norm": 9.4944502059902, + "learning_rate": 8.917205436765272e-06, + "loss": 0.9034, + "step": 5155 + }, + { + "epoch": 0.88, + "grad_norm": 10.607776879666043, + "learning_rate": 8.914130047770488e-06, + "loss": 0.9163, + "step": 5160 + }, + { + "epoch": 0.88, + "grad_norm": 15.664686781195138, + "learning_rate": 8.911050829476707e-06, + "loss": 0.9142, + "step": 5165 + }, + { + "epoch": 0.88, + "grad_norm": 13.415556266796443, + "learning_rate": 8.90796778489641e-06, + "loss": 0.9195, + "step": 5170 + }, + { + "epoch": 0.88, + "grad_norm": 20.298844004102687, + "learning_rate": 8.90488091704582e-06, + "loss": 0.9105, + "step": 5175 + }, + { + "epoch": 0.88, + "grad_norm": 9.737019741824055, + "learning_rate": 8.901790228944904e-06, + "loss": 0.9048, + "step": 5180 + }, + { + "epoch": 0.88, + "grad_norm": 13.18933269117288, + "learning_rate": 8.898695723617368e-06, + "loss": 0.9123, + "step": 5185 + }, + { + "epoch": 0.88, + "grad_norm": 27.076492095174448, + "learning_rate": 8.895597404090647e-06, + "loss": 0.9173, + "step": 5190 + }, + { + "epoch": 0.88, + "grad_norm": 17.017497657176467, + "learning_rate": 8.892495273395913e-06, + "loss": 0.9235, + "step": 5195 + }, + { + "epoch": 0.88, + "grad_norm": 13.241184424854241, + "learning_rate": 8.889389334568061e-06, + "loss": 0.8984, + "step": 5200 + }, + { + "epoch": 0.88, + "grad_norm": 14.896688029270276, + "learning_rate": 8.88627959064572e-06, + "loss": 0.8982, + "step": 5205 + }, + { + "epoch": 0.89, + "grad_norm": 10.933863082457536, + "learning_rate": 8.883166044671232e-06, + "loss": 0.9027, + "step": 5210 + }, + { + "epoch": 0.89, + "grad_norm": 21.301815661889073, + "learning_rate": 8.880048699690664e-06, + "loss": 0.8975, + "step": 5215 + }, + { + "epoch": 0.89, + "grad_norm": 17.79520585776361, + "learning_rate": 8.876927558753798e-06, + "loss": 0.9131, + "step": 5220 + }, + { + "epoch": 0.89, + "grad_norm": 69.54083736297005, + "learning_rate": 8.873802624914132e-06, + "loss": 0.8882, + "step": 5225 + }, + { + "epoch": 0.89, + "grad_norm": 29.51640291474208, + "learning_rate": 8.870673901228874e-06, + "loss": 0.8986, + "step": 5230 + }, + { + "epoch": 0.89, + "grad_norm": 48.5416443853786, + "learning_rate": 8.867541390758935e-06, + "loss": 0.8993, + "step": 5235 + }, + { + "epoch": 0.89, + "grad_norm": 45.16022542591326, + "learning_rate": 8.864405096568937e-06, + "loss": 0.8991, + "step": 5240 + }, + { + "epoch": 0.89, + "grad_norm": 48.08298242632971, + "learning_rate": 8.861265021727202e-06, + "loss": 0.8939, + "step": 5245 + }, + { + "epoch": 0.89, + "grad_norm": 14.156779794459752, + "learning_rate": 8.858121169305747e-06, + "loss": 0.9005, + "step": 5250 + }, + { + "epoch": 0.89, + "grad_norm": 16.63891538193591, + "learning_rate": 8.854973542380289e-06, + "loss": 0.907, + "step": 5255 + }, + { + "epoch": 0.89, + "grad_norm": 48.7776713914096, + "learning_rate": 8.851822144030237e-06, + "loss": 0.9039, + "step": 5260 + }, + { + "epoch": 0.9, + "grad_norm": 55.13232465033205, + "learning_rate": 8.848666977338689e-06, + "loss": 0.8962, + "step": 5265 + }, + { + "epoch": 0.9, + "grad_norm": 29.5918085171386, + "learning_rate": 8.84550804539243e-06, + "loss": 0.9048, + "step": 5270 + }, + { + "epoch": 0.9, + "grad_norm": 25.783491763675134, + "learning_rate": 8.842345351281927e-06, + "loss": 0.9006, + "step": 5275 + }, + { + "epoch": 0.9, + "grad_norm": 38.42385751738853, + "learning_rate": 8.839178898101327e-06, + "loss": 0.9112, + "step": 5280 + }, + { + "epoch": 0.9, + "grad_norm": 17.363290467768564, + "learning_rate": 8.836008688948463e-06, + "loss": 0.9023, + "step": 5285 + }, + { + "epoch": 0.9, + "grad_norm": 11.318139570581586, + "learning_rate": 8.832834726924832e-06, + "loss": 0.9123, + "step": 5290 + }, + { + "epoch": 0.9, + "grad_norm": 10.274091473033106, + "learning_rate": 8.829657015135605e-06, + "loss": 0.8881, + "step": 5295 + }, + { + "epoch": 0.9, + "grad_norm": 9.354741961314343, + "learning_rate": 8.826475556689627e-06, + "loss": 0.8992, + "step": 5300 + }, + { + "epoch": 0.9, + "grad_norm": 22.730814539741452, + "learning_rate": 8.8232903546994e-06, + "loss": 0.8945, + "step": 5305 + }, + { + "epoch": 0.9, + "grad_norm": 11.52278714976617, + "learning_rate": 8.820101412281098e-06, + "loss": 0.9051, + "step": 5310 + }, + { + "epoch": 0.9, + "grad_norm": 21.82268336044992, + "learning_rate": 8.816908732554546e-06, + "loss": 0.8859, + "step": 5315 + }, + { + "epoch": 0.9, + "grad_norm": 10.685162774578716, + "learning_rate": 8.81371231864323e-06, + "loss": 0.896, + "step": 5320 + }, + { + "epoch": 0.91, + "grad_norm": 18.962361513069325, + "learning_rate": 8.810512173674288e-06, + "loss": 0.8869, + "step": 5325 + }, + { + "epoch": 0.91, + "grad_norm": 17.735749882533618, + "learning_rate": 8.807308300778508e-06, + "loss": 0.9119, + "step": 5330 + }, + { + "epoch": 0.91, + "grad_norm": 32.718411258492466, + "learning_rate": 8.804100703090324e-06, + "loss": 0.8902, + "step": 5335 + }, + { + "epoch": 0.91, + "grad_norm": 17.748223147493437, + "learning_rate": 8.800889383747817e-06, + "loss": 0.8814, + "step": 5340 + }, + { + "epoch": 0.91, + "grad_norm": 7.8215112357413545, + "learning_rate": 8.797674345892707e-06, + "loss": 0.8917, + "step": 5345 + }, + { + "epoch": 0.91, + "grad_norm": 9.491577453092495, + "learning_rate": 8.794455592670353e-06, + "loss": 0.8965, + "step": 5350 + }, + { + "epoch": 0.91, + "grad_norm": 30.099277973463792, + "learning_rate": 8.791233127229746e-06, + "loss": 0.8815, + "step": 5355 + }, + { + "epoch": 0.91, + "grad_norm": 21.471034023934507, + "learning_rate": 8.788006952723512e-06, + "loss": 0.9014, + "step": 5360 + }, + { + "epoch": 0.91, + "grad_norm": 21.232711801308177, + "learning_rate": 8.784777072307904e-06, + "loss": 0.892, + "step": 5365 + }, + { + "epoch": 0.91, + "grad_norm": 7.428510179914167, + "learning_rate": 8.781543489142802e-06, + "loss": 0.872, + "step": 5370 + }, + { + "epoch": 0.91, + "grad_norm": 11.45373486754463, + "learning_rate": 8.778306206391707e-06, + "loss": 0.8938, + "step": 5375 + }, + { + "epoch": 0.91, + "grad_norm": 13.15825587665837, + "learning_rate": 8.775065227221742e-06, + "loss": 0.8897, + "step": 5380 + }, + { + "epoch": 0.92, + "grad_norm": 20.21159615999797, + "learning_rate": 8.77182055480364e-06, + "loss": 0.8693, + "step": 5385 + }, + { + "epoch": 0.92, + "grad_norm": 20.704802237923644, + "learning_rate": 8.768572192311757e-06, + "loss": 0.9013, + "step": 5390 + }, + { + "epoch": 0.92, + "grad_norm": 19.60393581725405, + "learning_rate": 8.76532014292405e-06, + "loss": 0.8802, + "step": 5395 + }, + { + "epoch": 0.92, + "grad_norm": 32.697742283265754, + "learning_rate": 8.762064409822086e-06, + "loss": 0.9065, + "step": 5400 + }, + { + "epoch": 0.92, + "grad_norm": 34.84197654856875, + "learning_rate": 8.758804996191039e-06, + "loss": 0.8728, + "step": 5405 + }, + { + "epoch": 0.92, + "grad_norm": 14.215924467751915, + "learning_rate": 8.75554190521968e-06, + "loss": 0.873, + "step": 5410 + }, + { + "epoch": 0.92, + "grad_norm": 12.416431470943925, + "learning_rate": 8.752275140100379e-06, + "loss": 0.8839, + "step": 5415 + }, + { + "epoch": 0.92, + "grad_norm": 11.803589736885627, + "learning_rate": 8.749004704029101e-06, + "loss": 0.8829, + "step": 5420 + }, + { + "epoch": 0.92, + "grad_norm": 10.893782852593526, + "learning_rate": 8.745730600205402e-06, + "loss": 0.8834, + "step": 5425 + }, + { + "epoch": 0.92, + "grad_norm": 13.70609303531889, + "learning_rate": 8.742452831832424e-06, + "loss": 0.8986, + "step": 5430 + }, + { + "epoch": 0.92, + "grad_norm": 23.16719204178583, + "learning_rate": 8.7391714021169e-06, + "loss": 0.9119, + "step": 5435 + }, + { + "epoch": 0.92, + "grad_norm": 8.350841301575068, + "learning_rate": 8.735886314269136e-06, + "loss": 0.8937, + "step": 5440 + }, + { + "epoch": 0.93, + "grad_norm": 10.644162093171502, + "learning_rate": 8.732597571503028e-06, + "loss": 0.8847, + "step": 5445 + }, + { + "epoch": 0.93, + "grad_norm": 26.775136944455053, + "learning_rate": 8.729305177036035e-06, + "loss": 0.8868, + "step": 5450 + }, + { + "epoch": 0.93, + "grad_norm": 16.916084579118998, + "learning_rate": 8.726009134089202e-06, + "loss": 0.8925, + "step": 5455 + }, + { + "epoch": 0.93, + "grad_norm": 25.97610450325379, + "learning_rate": 8.722709445887132e-06, + "loss": 0.888, + "step": 5460 + }, + { + "epoch": 0.93, + "grad_norm": 16.674169978898313, + "learning_rate": 8.719406115658002e-06, + "loss": 0.8882, + "step": 5465 + }, + { + "epoch": 0.93, + "grad_norm": 9.1421190490301, + "learning_rate": 8.716099146633548e-06, + "loss": 0.8817, + "step": 5470 + }, + { + "epoch": 0.93, + "grad_norm": 12.709501995828935, + "learning_rate": 8.712788542049066e-06, + "loss": 0.8745, + "step": 5475 + }, + { + "epoch": 0.93, + "grad_norm": 11.302338450780343, + "learning_rate": 8.70947430514341e-06, + "loss": 0.8794, + "step": 5480 + }, + { + "epoch": 0.93, + "grad_norm": 25.359202453596357, + "learning_rate": 8.706156439158988e-06, + "loss": 0.8961, + "step": 5485 + }, + { + "epoch": 0.93, + "grad_norm": 14.57404538528837, + "learning_rate": 8.702834947341759e-06, + "loss": 0.8548, + "step": 5490 + }, + { + "epoch": 0.93, + "grad_norm": 8.621561636096867, + "learning_rate": 8.699509832941224e-06, + "loss": 0.8827, + "step": 5495 + }, + { + "epoch": 0.94, + "grad_norm": 10.556414130907989, + "learning_rate": 8.696181099210436e-06, + "loss": 0.899, + "step": 5500 + }, + { + "epoch": 0.94, + "grad_norm": 20.965519473228266, + "learning_rate": 8.692848749405985e-06, + "loss": 0.8852, + "step": 5505 + }, + { + "epoch": 0.94, + "grad_norm": 7.702078440520662, + "learning_rate": 8.689512786787996e-06, + "loss": 0.8554, + "step": 5510 + }, + { + "epoch": 0.94, + "grad_norm": 15.307984622928407, + "learning_rate": 8.686173214620134e-06, + "loss": 0.8872, + "step": 5515 + }, + { + "epoch": 0.94, + "grad_norm": 36.1312012987165, + "learning_rate": 8.68283003616959e-06, + "loss": 0.8548, + "step": 5520 + }, + { + "epoch": 0.94, + "grad_norm": 13.263820091037312, + "learning_rate": 8.679483254707089e-06, + "loss": 0.8847, + "step": 5525 + }, + { + "epoch": 0.94, + "grad_norm": 13.888523618921328, + "learning_rate": 8.676132873506873e-06, + "loss": 0.8723, + "step": 5530 + }, + { + "epoch": 0.94, + "grad_norm": 7.882036703727629, + "learning_rate": 8.672778895846715e-06, + "loss": 0.8834, + "step": 5535 + }, + { + "epoch": 0.94, + "grad_norm": 20.865318506881103, + "learning_rate": 8.669421325007897e-06, + "loss": 0.8802, + "step": 5540 + }, + { + "epoch": 0.94, + "grad_norm": 11.716621596417292, + "learning_rate": 8.666060164275224e-06, + "loss": 0.8843, + "step": 5545 + }, + { + "epoch": 0.94, + "grad_norm": 9.636304054762679, + "learning_rate": 8.66269541693701e-06, + "loss": 0.8835, + "step": 5550 + }, + { + "epoch": 0.94, + "grad_norm": 8.36733991467472, + "learning_rate": 8.659327086285079e-06, + "loss": 0.8919, + "step": 5555 + }, + { + "epoch": 0.95, + "grad_norm": 33.84809764114348, + "learning_rate": 8.655955175614758e-06, + "loss": 0.8774, + "step": 5560 + }, + { + "epoch": 0.95, + "grad_norm": 39.82957262336742, + "learning_rate": 8.65257968822488e-06, + "loss": 0.8783, + "step": 5565 + }, + { + "epoch": 0.95, + "grad_norm": 67.01059353943249, + "learning_rate": 8.649200627417774e-06, + "loss": 0.8784, + "step": 5570 + }, + { + "epoch": 0.95, + "grad_norm": 45.26710543417326, + "learning_rate": 8.645817996499264e-06, + "loss": 0.866, + "step": 5575 + }, + { + "epoch": 0.95, + "grad_norm": 74.55636509273494, + "learning_rate": 8.642431798778675e-06, + "loss": 0.8906, + "step": 5580 + }, + { + "epoch": 0.95, + "grad_norm": 18.565055424673996, + "learning_rate": 8.63904203756881e-06, + "loss": 0.8829, + "step": 5585 + }, + { + "epoch": 0.95, + "grad_norm": 23.61630072608257, + "learning_rate": 8.635648716185964e-06, + "loss": 0.8635, + "step": 5590 + }, + { + "epoch": 0.95, + "grad_norm": 17.35054245441266, + "learning_rate": 8.632251837949921e-06, + "loss": 0.8593, + "step": 5595 + }, + { + "epoch": 0.95, + "grad_norm": 8.106255161947752, + "learning_rate": 8.628851406183932e-06, + "loss": 0.884, + "step": 5600 + }, + { + "epoch": 0.95, + "grad_norm": 13.889113128468237, + "learning_rate": 8.625447424214734e-06, + "loss": 0.8813, + "step": 5605 + }, + { + "epoch": 0.95, + "grad_norm": 12.865051244498067, + "learning_rate": 8.622039895372533e-06, + "loss": 0.8618, + "step": 5610 + }, + { + "epoch": 0.95, + "grad_norm": 9.349655012279877, + "learning_rate": 8.618628822991009e-06, + "loss": 0.8655, + "step": 5615 + }, + { + "epoch": 0.96, + "grad_norm": 23.20553059319047, + "learning_rate": 8.615214210407304e-06, + "loss": 0.8658, + "step": 5620 + }, + { + "epoch": 0.96, + "grad_norm": 30.110412035643414, + "learning_rate": 8.611796060962025e-06, + "loss": 0.8619, + "step": 5625 + }, + { + "epoch": 0.96, + "grad_norm": 8.021610994380586, + "learning_rate": 8.608374377999242e-06, + "loss": 0.8635, + "step": 5630 + }, + { + "epoch": 0.96, + "grad_norm": 7.71017469605099, + "learning_rate": 8.604949164866478e-06, + "loss": 0.8625, + "step": 5635 + }, + { + "epoch": 0.96, + "grad_norm": 16.572210414926698, + "learning_rate": 8.601520424914712e-06, + "loss": 0.8858, + "step": 5640 + }, + { + "epoch": 0.96, + "grad_norm": 22.351789187509702, + "learning_rate": 8.598088161498372e-06, + "loss": 0.8571, + "step": 5645 + }, + { + "epoch": 0.96, + "grad_norm": 36.86554438746097, + "learning_rate": 8.594652377975335e-06, + "loss": 0.8658, + "step": 5650 + }, + { + "epoch": 0.96, + "grad_norm": 20.15626490218535, + "learning_rate": 8.591213077706918e-06, + "loss": 0.88, + "step": 5655 + }, + { + "epoch": 0.96, + "grad_norm": 17.06297849796565, + "learning_rate": 8.587770264057887e-06, + "loss": 0.8498, + "step": 5660 + }, + { + "epoch": 0.96, + "grad_norm": 10.00576879340255, + "learning_rate": 8.584323940396435e-06, + "loss": 0.8561, + "step": 5665 + }, + { + "epoch": 0.96, + "grad_norm": 39.968686341276495, + "learning_rate": 8.580874110094193e-06, + "loss": 0.8756, + "step": 5670 + }, + { + "epoch": 0.96, + "grad_norm": 26.27040895403256, + "learning_rate": 8.577420776526225e-06, + "loss": 0.8655, + "step": 5675 + }, + { + "epoch": 0.97, + "grad_norm": 11.695621589411635, + "learning_rate": 8.57396394307102e-06, + "loss": 0.8633, + "step": 5680 + }, + { + "epoch": 0.97, + "grad_norm": 15.951644924859142, + "learning_rate": 8.57050361311049e-06, + "loss": 0.8641, + "step": 5685 + }, + { + "epoch": 0.97, + "grad_norm": 14.871755754368417, + "learning_rate": 8.567039790029972e-06, + "loss": 0.8441, + "step": 5690 + }, + { + "epoch": 0.97, + "grad_norm": 18.86324287898087, + "learning_rate": 8.563572477218216e-06, + "loss": 0.8558, + "step": 5695 + }, + { + "epoch": 0.97, + "grad_norm": 12.9497961055212, + "learning_rate": 8.560101678067385e-06, + "loss": 0.8626, + "step": 5700 + }, + { + "epoch": 0.97, + "grad_norm": 12.64541028079422, + "learning_rate": 8.55662739597306e-06, + "loss": 0.8723, + "step": 5705 + }, + { + "epoch": 0.97, + "grad_norm": 8.077404639704291, + "learning_rate": 8.553149634334221e-06, + "loss": 0.8498, + "step": 5710 + }, + { + "epoch": 0.97, + "grad_norm": 14.095385130597634, + "learning_rate": 8.54966839655326e-06, + "loss": 0.8676, + "step": 5715 + }, + { + "epoch": 0.97, + "grad_norm": 16.816198204667515, + "learning_rate": 8.546183686035963e-06, + "loss": 0.86, + "step": 5720 + }, + { + "epoch": 0.97, + "grad_norm": 10.324141871325514, + "learning_rate": 8.542695506191516e-06, + "loss": 0.8453, + "step": 5725 + }, + { + "epoch": 0.97, + "grad_norm": 22.931657761275982, + "learning_rate": 8.5392038604325e-06, + "loss": 0.8624, + "step": 5730 + }, + { + "epoch": 0.98, + "grad_norm": 22.340937938883243, + "learning_rate": 8.535708752174887e-06, + "loss": 0.8584, + "step": 5735 + }, + { + "epoch": 0.98, + "grad_norm": 13.47137594925963, + "learning_rate": 8.532210184838034e-06, + "loss": 0.8562, + "step": 5740 + }, + { + "epoch": 0.98, + "grad_norm": 7.022989456393278, + "learning_rate": 8.528708161844688e-06, + "loss": 0.8517, + "step": 5745 + }, + { + "epoch": 0.98, + "grad_norm": 14.389404070039904, + "learning_rate": 8.525202686620968e-06, + "loss": 0.8591, + "step": 5750 + }, + { + "epoch": 0.98, + "grad_norm": 11.069368671975361, + "learning_rate": 8.521693762596376e-06, + "loss": 0.8572, + "step": 5755 + }, + { + "epoch": 0.98, + "grad_norm": 31.320280806061785, + "learning_rate": 8.518181393203787e-06, + "loss": 0.868, + "step": 5760 + }, + { + "epoch": 0.98, + "grad_norm": 8.119093866352447, + "learning_rate": 8.514665581879448e-06, + "loss": 0.8533, + "step": 5765 + }, + { + "epoch": 0.98, + "grad_norm": 20.745899092682233, + "learning_rate": 8.511146332062971e-06, + "loss": 0.8492, + "step": 5770 + }, + { + "epoch": 0.98, + "grad_norm": 9.102720102074791, + "learning_rate": 8.507623647197334e-06, + "loss": 0.8752, + "step": 5775 + }, + { + "epoch": 0.98, + "grad_norm": 13.17053377877784, + "learning_rate": 8.504097530728875e-06, + "loss": 0.8411, + "step": 5780 + }, + { + "epoch": 0.98, + "grad_norm": 28.606022516403232, + "learning_rate": 8.500567986107286e-06, + "loss": 0.8534, + "step": 5785 + }, + { + "epoch": 0.98, + "grad_norm": 27.615463405818634, + "learning_rate": 8.497035016785617e-06, + "loss": 0.8369, + "step": 5790 + }, + { + "epoch": 0.99, + "grad_norm": 36.01795392268902, + "learning_rate": 8.49349862622027e-06, + "loss": 0.8488, + "step": 5795 + }, + { + "epoch": 0.99, + "grad_norm": 13.213418700089807, + "learning_rate": 8.489958817870987e-06, + "loss": 0.8535, + "step": 5800 + }, + { + "epoch": 0.99, + "grad_norm": 15.444031330332194, + "learning_rate": 8.486415595200862e-06, + "loss": 0.8453, + "step": 5805 + }, + { + "epoch": 0.99, + "grad_norm": 10.335189050348509, + "learning_rate": 8.482868961676321e-06, + "loss": 0.8573, + "step": 5810 + }, + { + "epoch": 0.99, + "grad_norm": 15.00363654906774, + "learning_rate": 8.479318920767133e-06, + "loss": 0.8515, + "step": 5815 + }, + { + "epoch": 0.99, + "grad_norm": 13.073163283981378, + "learning_rate": 8.4757654759464e-06, + "loss": 0.8542, + "step": 5820 + }, + { + "epoch": 0.99, + "grad_norm": 29.39872449053244, + "learning_rate": 8.472208630690553e-06, + "loss": 0.8415, + "step": 5825 + }, + { + "epoch": 0.99, + "grad_norm": 12.515141299600042, + "learning_rate": 8.468648388479347e-06, + "loss": 0.8381, + "step": 5830 + }, + { + "epoch": 0.99, + "grad_norm": 31.37491484904034, + "learning_rate": 8.465084752795867e-06, + "loss": 0.8603, + "step": 5835 + }, + { + "epoch": 0.99, + "grad_norm": 58.3350303623845, + "learning_rate": 8.461517727126511e-06, + "loss": 0.836, + "step": 5840 + }, + { + "epoch": 0.99, + "grad_norm": 29.165439866135134, + "learning_rate": 8.457947314960996e-06, + "loss": 0.858, + "step": 5845 + }, + { + "epoch": 0.99, + "grad_norm": 11.285011618464788, + "learning_rate": 8.454373519792355e-06, + "loss": 0.8491, + "step": 5850 + }, + { + "epoch": 1.0, + "grad_norm": 11.832032196171967, + "learning_rate": 8.450796345116926e-06, + "loss": 0.8352, + "step": 5855 + }, + { + "epoch": 1.0, + "grad_norm": 10.440740651835677, + "learning_rate": 8.447215794434356e-06, + "loss": 0.8607, + "step": 5860 + }, + { + "epoch": 1.0, + "grad_norm": 53.62948227734646, + "learning_rate": 8.443631871247598e-06, + "loss": 0.8758, + "step": 5865 + }, + { + "epoch": 1.0, + "grad_norm": 40.84158978189534, + "learning_rate": 8.440044579062894e-06, + "loss": 0.8418, + "step": 5870 + }, + { + "epoch": 1.0, + "grad_norm": 8.478218209150542, + "learning_rate": 8.436453921389791e-06, + "loss": 0.8503, + "step": 5875 + }, + { + "epoch": 1.0, + "grad_norm": 14.378497072180087, + "learning_rate": 8.43285990174113e-06, + "loss": 0.8671, + "step": 5880 + }, + { + "epoch": 1.0, + "eval_loss": 0.7445269227027893, + "eval_runtime": 75.2523, + "eval_samples_per_second": 4.81, + "eval_steps_per_second": 0.611, + "step": 5882 + }, + { + "epoch": 1.0, + "grad_norm": 16.074131915930433, + "learning_rate": 8.429262523633034e-06, + "loss": 0.7902, + "step": 5885 + }, + { + "epoch": 1.0, + "grad_norm": 7.338805256843147, + "learning_rate": 8.425661790584916e-06, + "loss": 0.7448, + "step": 5890 + }, + { + "epoch": 1.0, + "grad_norm": 21.507136158145123, + "learning_rate": 8.422057706119468e-06, + "loss": 0.7342, + "step": 5895 + }, + { + "epoch": 1.0, + "grad_norm": 43.03840171835638, + "learning_rate": 8.418450273762665e-06, + "loss": 0.7323, + "step": 5900 + }, + { + "epoch": 1.0, + "grad_norm": 21.45456264756192, + "learning_rate": 8.41483949704376e-06, + "loss": 0.7333, + "step": 5905 + }, + { + "epoch": 1.0, + "grad_norm": 17.793455098771023, + "learning_rate": 8.411225379495265e-06, + "loss": 0.7466, + "step": 5910 + }, + { + "epoch": 1.01, + "grad_norm": 9.254994967182895, + "learning_rate": 8.407607924652971e-06, + "loss": 0.7464, + "step": 5915 + }, + { + "epoch": 1.01, + "grad_norm": 18.7584036810001, + "learning_rate": 8.403987136055935e-06, + "loss": 0.7299, + "step": 5920 + }, + { + "epoch": 1.01, + "grad_norm": 13.026506169069334, + "learning_rate": 8.40036301724647e-06, + "loss": 0.7196, + "step": 5925 + }, + { + "epoch": 1.01, + "grad_norm": 9.346466183636792, + "learning_rate": 8.39673557177015e-06, + "loss": 0.7511, + "step": 5930 + }, + { + "epoch": 1.01, + "grad_norm": 18.495927112645926, + "learning_rate": 8.3931048031758e-06, + "loss": 0.7254, + "step": 5935 + }, + { + "epoch": 1.01, + "grad_norm": 17.709617231851123, + "learning_rate": 8.389470715015501e-06, + "loss": 0.7286, + "step": 5940 + }, + { + "epoch": 1.01, + "grad_norm": 6.780660397689895, + "learning_rate": 8.385833310844582e-06, + "loss": 0.7139, + "step": 5945 + }, + { + "epoch": 1.01, + "grad_norm": 11.032615580465922, + "learning_rate": 8.382192594221608e-06, + "loss": 0.749, + "step": 5950 + }, + { + "epoch": 1.01, + "grad_norm": 25.25163294724016, + "learning_rate": 8.378548568708396e-06, + "loss": 0.7395, + "step": 5955 + }, + { + "epoch": 1.01, + "grad_norm": 16.169665624561432, + "learning_rate": 8.374901237869989e-06, + "loss": 0.7303, + "step": 5960 + }, + { + "epoch": 1.01, + "grad_norm": 26.34099151157238, + "learning_rate": 8.371250605274673e-06, + "loss": 0.7056, + "step": 5965 + }, + { + "epoch": 1.01, + "grad_norm": 32.873244414003075, + "learning_rate": 8.367596674493959e-06, + "loss": 0.7414, + "step": 5970 + }, + { + "epoch": 1.02, + "grad_norm": 30.87560803986548, + "learning_rate": 8.363939449102586e-06, + "loss": 0.7282, + "step": 5975 + }, + { + "epoch": 1.02, + "grad_norm": 19.528015029331623, + "learning_rate": 8.360278932678515e-06, + "loss": 0.7305, + "step": 5980 + }, + { + "epoch": 1.02, + "grad_norm": 9.445981084181716, + "learning_rate": 8.356615128802933e-06, + "loss": 0.7212, + "step": 5985 + }, + { + "epoch": 1.02, + "grad_norm": 23.770775974779845, + "learning_rate": 8.352948041060234e-06, + "loss": 0.7378, + "step": 5990 + }, + { + "epoch": 1.02, + "grad_norm": 19.14504601941375, + "learning_rate": 8.349277673038026e-06, + "loss": 0.7465, + "step": 5995 + }, + { + "epoch": 1.02, + "grad_norm": 20.290936891320126, + "learning_rate": 8.345604028327134e-06, + "loss": 0.7314, + "step": 6000 + }, + { + "epoch": 1.02, + "grad_norm": 7.7028244206432515, + "learning_rate": 8.341927110521583e-06, + "loss": 0.7185, + "step": 6005 + }, + { + "epoch": 1.02, + "grad_norm": 10.604020058712843, + "learning_rate": 8.3382469232186e-06, + "loss": 0.7467, + "step": 6010 + }, + { + "epoch": 1.02, + "grad_norm": 56.19937286099307, + "learning_rate": 8.334563470018611e-06, + "loss": 0.7333, + "step": 6015 + }, + { + "epoch": 1.02, + "grad_norm": 26.771294283391278, + "learning_rate": 8.33087675452524e-06, + "loss": 0.7225, + "step": 6020 + }, + { + "epoch": 1.02, + "grad_norm": 11.734466397931051, + "learning_rate": 8.327186780345295e-06, + "loss": 0.7247, + "step": 6025 + }, + { + "epoch": 1.03, + "grad_norm": 33.72500753847307, + "learning_rate": 8.323493551088782e-06, + "loss": 0.7224, + "step": 6030 + }, + { + "epoch": 1.03, + "grad_norm": 16.90816584525726, + "learning_rate": 8.319797070368885e-06, + "loss": 0.7334, + "step": 6035 + }, + { + "epoch": 1.03, + "grad_norm": 11.719459783117856, + "learning_rate": 8.316097341801972e-06, + "loss": 0.732, + "step": 6040 + }, + { + "epoch": 1.03, + "grad_norm": 11.500384938602528, + "learning_rate": 8.312394369007586e-06, + "loss": 0.7176, + "step": 6045 + }, + { + "epoch": 1.03, + "grad_norm": 13.378871880562029, + "learning_rate": 8.308688155608446e-06, + "loss": 0.7264, + "step": 6050 + }, + { + "epoch": 1.03, + "grad_norm": 11.840820667726737, + "learning_rate": 8.30497870523044e-06, + "loss": 0.7267, + "step": 6055 + }, + { + "epoch": 1.03, + "grad_norm": 22.140046240955762, + "learning_rate": 8.301266021502622e-06, + "loss": 0.7344, + "step": 6060 + }, + { + "epoch": 1.03, + "grad_norm": 31.539045412586958, + "learning_rate": 8.297550108057213e-06, + "loss": 0.7417, + "step": 6065 + }, + { + "epoch": 1.03, + "grad_norm": 7.69385281998031, + "learning_rate": 8.293830968529592e-06, + "loss": 0.7461, + "step": 6070 + }, + { + "epoch": 1.03, + "grad_norm": 10.582764103392428, + "learning_rate": 8.29010860655829e-06, + "loss": 0.7292, + "step": 6075 + }, + { + "epoch": 1.03, + "grad_norm": 22.418600869543923, + "learning_rate": 8.286383025784997e-06, + "loss": 0.7311, + "step": 6080 + }, + { + "epoch": 1.03, + "grad_norm": 15.838293990902837, + "learning_rate": 8.282654229854547e-06, + "loss": 0.7406, + "step": 6085 + }, + { + "epoch": 1.04, + "grad_norm": 9.173142797379702, + "learning_rate": 8.278922222414924e-06, + "loss": 0.7331, + "step": 6090 + }, + { + "epoch": 1.04, + "grad_norm": 9.858068006200584, + "learning_rate": 8.275187007117251e-06, + "loss": 0.6926, + "step": 6095 + }, + { + "epoch": 1.04, + "grad_norm": 18.922773613667474, + "learning_rate": 8.27144858761579e-06, + "loss": 0.7242, + "step": 6100 + }, + { + "epoch": 1.04, + "grad_norm": 12.763980438898749, + "learning_rate": 8.267706967567935e-06, + "loss": 0.7223, + "step": 6105 + }, + { + "epoch": 1.04, + "grad_norm": 13.61708664822042, + "learning_rate": 8.263962150634215e-06, + "loss": 0.7126, + "step": 6110 + }, + { + "epoch": 1.04, + "grad_norm": 14.596927035396877, + "learning_rate": 8.26021414047829e-06, + "loss": 0.7192, + "step": 6115 + }, + { + "epoch": 1.04, + "grad_norm": 8.02870706457101, + "learning_rate": 8.256462940766932e-06, + "loss": 0.7286, + "step": 6120 + }, + { + "epoch": 1.04, + "grad_norm": 36.59195200291681, + "learning_rate": 8.252708555170044e-06, + "loss": 0.7096, + "step": 6125 + }, + { + "epoch": 1.04, + "grad_norm": 20.281469550974503, + "learning_rate": 8.248950987360645e-06, + "loss": 0.7167, + "step": 6130 + }, + { + "epoch": 1.04, + "grad_norm": 9.97594999145645, + "learning_rate": 8.245190241014863e-06, + "loss": 0.7021, + "step": 6135 + }, + { + "epoch": 1.04, + "grad_norm": 6.795171570462541, + "learning_rate": 8.241426319811938e-06, + "loss": 0.749, + "step": 6140 + }, + { + "epoch": 1.04, + "grad_norm": 13.792290293045824, + "learning_rate": 8.237659227434213e-06, + "loss": 0.7278, + "step": 6145 + }, + { + "epoch": 1.05, + "grad_norm": 7.983265764805912, + "learning_rate": 8.233888967567141e-06, + "loss": 0.7367, + "step": 6150 + }, + { + "epoch": 1.05, + "grad_norm": 18.662478937697124, + "learning_rate": 8.230115543899265e-06, + "loss": 0.713, + "step": 6155 + }, + { + "epoch": 1.05, + "grad_norm": 46.90862290358351, + "learning_rate": 8.226338960122229e-06, + "loss": 0.7281, + "step": 6160 + }, + { + "epoch": 1.05, + "grad_norm": 34.89234533567352, + "learning_rate": 8.222559219930766e-06, + "loss": 0.7333, + "step": 6165 + }, + { + "epoch": 1.05, + "grad_norm": 14.697726604622881, + "learning_rate": 8.218776327022696e-06, + "loss": 0.7464, + "step": 6170 + }, + { + "epoch": 1.05, + "grad_norm": 10.747423802614167, + "learning_rate": 8.214990285098931e-06, + "loss": 0.7376, + "step": 6175 + }, + { + "epoch": 1.05, + "grad_norm": 10.96902864680576, + "learning_rate": 8.211201097863452e-06, + "loss": 0.7287, + "step": 6180 + }, + { + "epoch": 1.05, + "grad_norm": 13.636950347264827, + "learning_rate": 8.207408769023324e-06, + "loss": 0.7094, + "step": 6185 + }, + { + "epoch": 1.05, + "grad_norm": 9.742565886062385, + "learning_rate": 8.203613302288689e-06, + "loss": 0.6964, + "step": 6190 + }, + { + "epoch": 1.05, + "grad_norm": 24.16036680358423, + "learning_rate": 8.199814701372748e-06, + "loss": 0.7124, + "step": 6195 + }, + { + "epoch": 1.05, + "grad_norm": 14.32299199634298, + "learning_rate": 8.19601296999178e-06, + "loss": 0.7255, + "step": 6200 + }, + { + "epoch": 1.05, + "grad_norm": 7.085040599511274, + "learning_rate": 8.192208111865118e-06, + "loss": 0.7077, + "step": 6205 + }, + { + "epoch": 1.06, + "grad_norm": 8.03400567894429, + "learning_rate": 8.188400130715159e-06, + "loss": 0.7118, + "step": 6210 + }, + { + "epoch": 1.06, + "grad_norm": 6.854524776829947, + "learning_rate": 8.184589030267353e-06, + "loss": 0.7329, + "step": 6215 + }, + { + "epoch": 1.06, + "grad_norm": 14.850945170965838, + "learning_rate": 8.180774814250204e-06, + "loss": 0.7138, + "step": 6220 + }, + { + "epoch": 1.06, + "grad_norm": 24.114471404232695, + "learning_rate": 8.17695748639526e-06, + "loss": 0.7213, + "step": 6225 + }, + { + "epoch": 1.06, + "grad_norm": 29.795134364882006, + "learning_rate": 8.173137050437118e-06, + "loss": 0.7127, + "step": 6230 + }, + { + "epoch": 1.06, + "grad_norm": 12.154362218077843, + "learning_rate": 8.169313510113413e-06, + "loss": 0.7324, + "step": 6235 + }, + { + "epoch": 1.06, + "grad_norm": 25.20196920703092, + "learning_rate": 8.165486869164814e-06, + "loss": 0.7124, + "step": 6240 + }, + { + "epoch": 1.06, + "grad_norm": 13.811823800608863, + "learning_rate": 8.161657131335033e-06, + "loss": 0.7104, + "step": 6245 + }, + { + "epoch": 1.06, + "grad_norm": 17.93546314393749, + "learning_rate": 8.157824300370801e-06, + "loss": 0.6941, + "step": 6250 + }, + { + "epoch": 1.06, + "grad_norm": 33.16849346210743, + "learning_rate": 8.153988380021881e-06, + "loss": 0.7347, + "step": 6255 + }, + { + "epoch": 1.06, + "grad_norm": 24.269651699560313, + "learning_rate": 8.150149374041061e-06, + "loss": 0.7379, + "step": 6260 + }, + { + "epoch": 1.07, + "grad_norm": 11.381379260049865, + "learning_rate": 8.146307286184141e-06, + "loss": 0.726, + "step": 6265 + }, + { + "epoch": 1.07, + "grad_norm": 12.343196756170697, + "learning_rate": 8.14246212020994e-06, + "loss": 0.7178, + "step": 6270 + }, + { + "epoch": 1.07, + "grad_norm": 7.578717720187098, + "learning_rate": 8.138613879880284e-06, + "loss": 0.7137, + "step": 6275 + }, + { + "epoch": 1.07, + "grad_norm": 31.370528526498276, + "learning_rate": 8.134762568960015e-06, + "loss": 0.7106, + "step": 6280 + }, + { + "epoch": 1.07, + "grad_norm": 13.249042174913429, + "learning_rate": 8.130908191216974e-06, + "loss": 0.7089, + "step": 6285 + }, + { + "epoch": 1.07, + "grad_norm": 23.809066454954635, + "learning_rate": 8.127050750422e-06, + "loss": 0.7045, + "step": 6290 + }, + { + "epoch": 1.07, + "grad_norm": 7.854982995546571, + "learning_rate": 8.123190250348932e-06, + "loss": 0.7164, + "step": 6295 + }, + { + "epoch": 1.07, + "grad_norm": 17.7265251355763, + "learning_rate": 8.119326694774602e-06, + "loss": 0.7012, + "step": 6300 + }, + { + "epoch": 1.07, + "grad_norm": 17.579992645590757, + "learning_rate": 8.115460087478833e-06, + "loss": 0.7404, + "step": 6305 + }, + { + "epoch": 1.07, + "grad_norm": 25.041819648349822, + "learning_rate": 8.11159043224443e-06, + "loss": 0.7015, + "step": 6310 + }, + { + "epoch": 1.07, + "grad_norm": 24.949195747447597, + "learning_rate": 8.107717732857177e-06, + "loss": 0.6872, + "step": 6315 + }, + { + "epoch": 1.07, + "grad_norm": 19.679411460396835, + "learning_rate": 8.103841993105843e-06, + "loss": 0.7072, + "step": 6320 + }, + { + "epoch": 1.08, + "grad_norm": 20.012872260255527, + "learning_rate": 8.099963216782171e-06, + "loss": 0.7031, + "step": 6325 + }, + { + "epoch": 1.08, + "grad_norm": 28.27758443192042, + "learning_rate": 8.09608140768087e-06, + "loss": 0.7259, + "step": 6330 + }, + { + "epoch": 1.08, + "grad_norm": 24.772908515131697, + "learning_rate": 8.092196569599619e-06, + "loss": 0.7219, + "step": 6335 + }, + { + "epoch": 1.08, + "grad_norm": 18.0503438246646, + "learning_rate": 8.08830870633906e-06, + "loss": 0.7279, + "step": 6340 + }, + { + "epoch": 1.08, + "grad_norm": 14.318031610520302, + "learning_rate": 8.084417821702796e-06, + "loss": 0.7082, + "step": 6345 + }, + { + "epoch": 1.08, + "grad_norm": 35.01375125895254, + "learning_rate": 8.080523919497381e-06, + "loss": 0.7282, + "step": 6350 + }, + { + "epoch": 1.08, + "grad_norm": 20.21136986925559, + "learning_rate": 8.076627003532328e-06, + "loss": 0.7057, + "step": 6355 + }, + { + "epoch": 1.08, + "grad_norm": 11.017121017069089, + "learning_rate": 8.072727077620092e-06, + "loss": 0.7136, + "step": 6360 + }, + { + "epoch": 1.08, + "grad_norm": 27.32813819885369, + "learning_rate": 8.068824145576077e-06, + "loss": 0.7499, + "step": 6365 + }, + { + "epoch": 1.08, + "grad_norm": 101.56111015961008, + "learning_rate": 8.064918211218628e-06, + "loss": 0.7211, + "step": 6370 + }, + { + "epoch": 1.08, + "grad_norm": 15.904740460926877, + "learning_rate": 8.061009278369026e-06, + "loss": 0.7587, + "step": 6375 + }, + { + "epoch": 1.08, + "grad_norm": 13.262944951104998, + "learning_rate": 8.05709735085148e-06, + "loss": 0.76, + "step": 6380 + }, + { + "epoch": 1.09, + "grad_norm": 19.900012124876444, + "learning_rate": 8.053182432493141e-06, + "loss": 0.7362, + "step": 6385 + }, + { + "epoch": 1.09, + "grad_norm": 10.601924165265274, + "learning_rate": 8.049264527124076e-06, + "loss": 0.7501, + "step": 6390 + }, + { + "epoch": 1.09, + "grad_norm": 21.08025285119702, + "learning_rate": 8.045343638577278e-06, + "loss": 0.7563, + "step": 6395 + }, + { + "epoch": 1.09, + "grad_norm": 20.646288499921937, + "learning_rate": 8.04141977068866e-06, + "loss": 0.7316, + "step": 6400 + }, + { + "epoch": 1.09, + "grad_norm": 9.096270740741641, + "learning_rate": 8.037492927297044e-06, + "loss": 0.7381, + "step": 6405 + }, + { + "epoch": 1.09, + "grad_norm": 8.580229233187346, + "learning_rate": 8.033563112244172e-06, + "loss": 0.7287, + "step": 6410 + }, + { + "epoch": 1.09, + "grad_norm": 18.28636073508511, + "learning_rate": 8.02963032937468e-06, + "loss": 0.7352, + "step": 6415 + }, + { + "epoch": 1.09, + "grad_norm": 8.346063474119852, + "learning_rate": 8.025694582536124e-06, + "loss": 0.7332, + "step": 6420 + }, + { + "epoch": 1.09, + "grad_norm": 10.246364151412761, + "learning_rate": 8.021755875578945e-06, + "loss": 0.71, + "step": 6425 + }, + { + "epoch": 1.09, + "grad_norm": 8.367025054003026, + "learning_rate": 8.017814212356492e-06, + "loss": 0.725, + "step": 6430 + }, + { + "epoch": 1.09, + "grad_norm": 8.714057040582556, + "learning_rate": 8.013869596724994e-06, + "loss": 0.7379, + "step": 6435 + }, + { + "epoch": 1.09, + "grad_norm": 8.593995045394287, + "learning_rate": 8.009922032543581e-06, + "loss": 0.7415, + "step": 6440 + }, + { + "epoch": 1.1, + "grad_norm": 26.587621305434546, + "learning_rate": 8.005971523674257e-06, + "loss": 0.7186, + "step": 6445 + }, + { + "epoch": 1.1, + "grad_norm": 27.706587902467838, + "learning_rate": 8.002018073981914e-06, + "loss": 0.7022, + "step": 6450 + }, + { + "epoch": 1.1, + "grad_norm": 21.224412769515773, + "learning_rate": 7.998061687334318e-06, + "loss": 0.7056, + "step": 6455 + }, + { + "epoch": 1.1, + "grad_norm": 14.643126663204766, + "learning_rate": 7.994102367602107e-06, + "loss": 0.7126, + "step": 6460 + }, + { + "epoch": 1.1, + "grad_norm": 8.286326181980531, + "learning_rate": 7.990140118658792e-06, + "loss": 0.7317, + "step": 6465 + }, + { + "epoch": 1.1, + "grad_norm": 26.733025687439078, + "learning_rate": 7.986174944380749e-06, + "loss": 0.7321, + "step": 6470 + }, + { + "epoch": 1.1, + "grad_norm": 19.564773455771654, + "learning_rate": 7.982206848647212e-06, + "loss": 0.7268, + "step": 6475 + }, + { + "epoch": 1.1, + "grad_norm": 17.448541273732022, + "learning_rate": 7.978235835340277e-06, + "loss": 0.744, + "step": 6480 + }, + { + "epoch": 1.1, + "grad_norm": 36.37395815875439, + "learning_rate": 7.974261908344896e-06, + "loss": 0.6985, + "step": 6485 + }, + { + "epoch": 1.1, + "grad_norm": 29.85056971539694, + "learning_rate": 7.970285071548868e-06, + "loss": 0.7214, + "step": 6490 + }, + { + "epoch": 1.1, + "grad_norm": 22.471371403927865, + "learning_rate": 7.966305328842838e-06, + "loss": 0.6981, + "step": 6495 + }, + { + "epoch": 1.11, + "grad_norm": 29.88202765503994, + "learning_rate": 7.962322684120295e-06, + "loss": 0.6977, + "step": 6500 + }, + { + "epoch": 1.11, + "grad_norm": 17.33714463691604, + "learning_rate": 7.95833714127757e-06, + "loss": 0.7451, + "step": 6505 + }, + { + "epoch": 1.11, + "grad_norm": 12.99959408887192, + "learning_rate": 7.954348704213825e-06, + "loss": 0.7204, + "step": 6510 + }, + { + "epoch": 1.11, + "grad_norm": 10.049862478420733, + "learning_rate": 7.95035737683106e-06, + "loss": 0.7168, + "step": 6515 + }, + { + "epoch": 1.11, + "grad_norm": 14.518802904427606, + "learning_rate": 7.946363163034092e-06, + "loss": 0.7031, + "step": 6520 + }, + { + "epoch": 1.11, + "grad_norm": 8.273603677482322, + "learning_rate": 7.942366066730571e-06, + "loss": 0.7324, + "step": 6525 + }, + { + "epoch": 1.11, + "grad_norm": 22.046757543422785, + "learning_rate": 7.938366091830967e-06, + "loss": 0.7265, + "step": 6530 + }, + { + "epoch": 1.11, + "grad_norm": 33.12928987146557, + "learning_rate": 7.93436324224856e-06, + "loss": 0.7145, + "step": 6535 + }, + { + "epoch": 1.11, + "grad_norm": 12.56350041819999, + "learning_rate": 7.930357521899444e-06, + "loss": 0.7149, + "step": 6540 + }, + { + "epoch": 1.11, + "grad_norm": 12.501911722744474, + "learning_rate": 7.926348934702526e-06, + "loss": 0.6998, + "step": 6545 + }, + { + "epoch": 1.11, + "grad_norm": 13.688733562201573, + "learning_rate": 7.922337484579516e-06, + "loss": 0.7107, + "step": 6550 + }, + { + "epoch": 1.11, + "grad_norm": 21.058453052713165, + "learning_rate": 7.918323175454923e-06, + "loss": 0.7332, + "step": 6555 + }, + { + "epoch": 1.12, + "grad_norm": 38.17079743809706, + "learning_rate": 7.914306011256051e-06, + "loss": 0.6928, + "step": 6560 + }, + { + "epoch": 1.12, + "grad_norm": 67.58525110577213, + "learning_rate": 7.910285995913003e-06, + "loss": 0.7347, + "step": 6565 + }, + { + "epoch": 1.12, + "grad_norm": 32.44302274542706, + "learning_rate": 7.90626313335867e-06, + "loss": 0.7164, + "step": 6570 + }, + { + "epoch": 1.12, + "grad_norm": 20.402541349672042, + "learning_rate": 7.902237427528721e-06, + "loss": 0.7061, + "step": 6575 + }, + { + "epoch": 1.12, + "grad_norm": 11.055494830630643, + "learning_rate": 7.89820888236162e-06, + "loss": 0.7074, + "step": 6580 + }, + { + "epoch": 1.12, + "grad_norm": 12.66556604728358, + "learning_rate": 7.894177501798595e-06, + "loss": 0.7163, + "step": 6585 + }, + { + "epoch": 1.12, + "grad_norm": 9.07830415040536, + "learning_rate": 7.890143289783658e-06, + "loss": 0.7098, + "step": 6590 + }, + { + "epoch": 1.12, + "grad_norm": 12.736714344164882, + "learning_rate": 7.886106250263588e-06, + "loss": 0.7215, + "step": 6595 + }, + { + "epoch": 1.12, + "grad_norm": 16.6673812587513, + "learning_rate": 7.882066387187926e-06, + "loss": 0.7087, + "step": 6600 + }, + { + "epoch": 1.12, + "grad_norm": 27.162715320340062, + "learning_rate": 7.87802370450898e-06, + "loss": 0.703, + "step": 6605 + }, + { + "epoch": 1.12, + "grad_norm": 17.595778017849216, + "learning_rate": 7.87397820618182e-06, + "loss": 0.732, + "step": 6610 + }, + { + "epoch": 1.12, + "grad_norm": 7.497247310820503, + "learning_rate": 7.869929896164262e-06, + "loss": 0.6938, + "step": 6615 + }, + { + "epoch": 1.13, + "grad_norm": 24.174550408631685, + "learning_rate": 7.865878778416879e-06, + "loss": 0.7326, + "step": 6620 + }, + { + "epoch": 1.13, + "grad_norm": 9.682620191492965, + "learning_rate": 7.861824856902984e-06, + "loss": 0.681, + "step": 6625 + }, + { + "epoch": 1.13, + "grad_norm": 7.042295074217735, + "learning_rate": 7.857768135588642e-06, + "loss": 0.7225, + "step": 6630 + }, + { + "epoch": 1.13, + "grad_norm": 23.16669446157789, + "learning_rate": 7.853708618442654e-06, + "loss": 0.7164, + "step": 6635 + }, + { + "epoch": 1.13, + "grad_norm": 7.921927166023332, + "learning_rate": 7.849646309436551e-06, + "loss": 0.6961, + "step": 6640 + }, + { + "epoch": 1.13, + "grad_norm": 11.13787412696501, + "learning_rate": 7.845581212544605e-06, + "loss": 0.7325, + "step": 6645 + }, + { + "epoch": 1.13, + "grad_norm": 11.317276381618884, + "learning_rate": 7.841513331743803e-06, + "loss": 0.7118, + "step": 6650 + }, + { + "epoch": 1.13, + "grad_norm": 13.678166525281911, + "learning_rate": 7.837442671013868e-06, + "loss": 0.7186, + "step": 6655 + }, + { + "epoch": 1.13, + "grad_norm": 10.039215360240258, + "learning_rate": 7.833369234337235e-06, + "loss": 0.7159, + "step": 6660 + }, + { + "epoch": 1.13, + "grad_norm": 12.15585567306351, + "learning_rate": 7.829293025699056e-06, + "loss": 0.72, + "step": 6665 + }, + { + "epoch": 1.13, + "grad_norm": 7.964575038033085, + "learning_rate": 7.8252140490872e-06, + "loss": 0.6964, + "step": 6670 + }, + { + "epoch": 1.13, + "grad_norm": 31.392270596686796, + "learning_rate": 7.821132308492235e-06, + "loss": 0.725, + "step": 6675 + }, + { + "epoch": 1.14, + "grad_norm": 22.243486431851114, + "learning_rate": 7.81704780790744e-06, + "loss": 0.7163, + "step": 6680 + }, + { + "epoch": 1.14, + "grad_norm": 32.554006910354886, + "learning_rate": 7.812960551328792e-06, + "loss": 0.7215, + "step": 6685 + }, + { + "epoch": 1.14, + "grad_norm": 11.860686728562028, + "learning_rate": 7.808870542754964e-06, + "loss": 0.7126, + "step": 6690 + }, + { + "epoch": 1.14, + "grad_norm": 14.717346410772016, + "learning_rate": 7.804777786187324e-06, + "loss": 0.7107, + "step": 6695 + }, + { + "epoch": 1.14, + "grad_norm": 10.488764788651848, + "learning_rate": 7.800682285629922e-06, + "loss": 0.7108, + "step": 6700 + }, + { + "epoch": 1.14, + "grad_norm": 20.458748569916793, + "learning_rate": 7.796584045089499e-06, + "loss": 0.7195, + "step": 6705 + }, + { + "epoch": 1.14, + "grad_norm": 15.795348738415434, + "learning_rate": 7.792483068575475e-06, + "loss": 0.7119, + "step": 6710 + }, + { + "epoch": 1.14, + "grad_norm": 24.613194371981375, + "learning_rate": 7.788379360099944e-06, + "loss": 0.7097, + "step": 6715 + }, + { + "epoch": 1.14, + "grad_norm": 16.381433477062384, + "learning_rate": 7.784272923677678e-06, + "loss": 0.6991, + "step": 6720 + }, + { + "epoch": 1.14, + "grad_norm": 12.110548394689014, + "learning_rate": 7.78016376332611e-06, + "loss": 0.7046, + "step": 6725 + }, + { + "epoch": 1.14, + "grad_norm": 15.048488403438107, + "learning_rate": 7.776051883065345e-06, + "loss": 0.7019, + "step": 6730 + }, + { + "epoch": 1.15, + "grad_norm": 29.011311668426053, + "learning_rate": 7.771937286918147e-06, + "loss": 0.7244, + "step": 6735 + }, + { + "epoch": 1.15, + "grad_norm": 28.93610413762296, + "learning_rate": 7.767819978909933e-06, + "loss": 0.6931, + "step": 6740 + }, + { + "epoch": 1.15, + "grad_norm": 8.355017448756705, + "learning_rate": 7.763699963068782e-06, + "loss": 0.7062, + "step": 6745 + }, + { + "epoch": 1.15, + "grad_norm": 7.110861457861793, + "learning_rate": 7.759577243425412e-06, + "loss": 0.6952, + "step": 6750 + }, + { + "epoch": 1.15, + "grad_norm": 12.727622562247888, + "learning_rate": 7.755451824013194e-06, + "loss": 0.7046, + "step": 6755 + }, + { + "epoch": 1.15, + "grad_norm": 25.70418981169325, + "learning_rate": 7.751323708868134e-06, + "loss": 0.683, + "step": 6760 + }, + { + "epoch": 1.15, + "grad_norm": 25.78166214567401, + "learning_rate": 7.74719290202888e-06, + "loss": 0.6869, + "step": 6765 + }, + { + "epoch": 1.15, + "grad_norm": 9.622497117699039, + "learning_rate": 7.74305940753671e-06, + "loss": 0.6902, + "step": 6770 + }, + { + "epoch": 1.15, + "grad_norm": 39.78003913153285, + "learning_rate": 7.738923229435538e-06, + "loss": 0.7013, + "step": 6775 + }, + { + "epoch": 1.15, + "grad_norm": 45.810278472194724, + "learning_rate": 7.734784371771894e-06, + "loss": 0.7171, + "step": 6780 + }, + { + "epoch": 1.15, + "grad_norm": 23.054568253500427, + "learning_rate": 7.730642838594932e-06, + "loss": 0.692, + "step": 6785 + }, + { + "epoch": 1.15, + "grad_norm": 10.059784782634388, + "learning_rate": 7.726498633956433e-06, + "loss": 0.6867, + "step": 6790 + }, + { + "epoch": 1.16, + "grad_norm": 9.177875706336556, + "learning_rate": 7.72235176191078e-06, + "loss": 0.7012, + "step": 6795 + }, + { + "epoch": 1.16, + "grad_norm": 7.4495938243906625, + "learning_rate": 7.718202226514968e-06, + "loss": 0.7038, + "step": 6800 + }, + { + "epoch": 1.16, + "grad_norm": 10.44562019083214, + "learning_rate": 7.714050031828602e-06, + "loss": 0.6935, + "step": 6805 + }, + { + "epoch": 1.16, + "grad_norm": 7.812788788197329, + "learning_rate": 7.709895181913887e-06, + "loss": 0.725, + "step": 6810 + }, + { + "epoch": 1.16, + "grad_norm": 9.336833846004621, + "learning_rate": 7.705737680835623e-06, + "loss": 0.6925, + "step": 6815 + }, + { + "epoch": 1.16, + "grad_norm": 8.987925326739377, + "learning_rate": 7.701577532661204e-06, + "loss": 0.6947, + "step": 6820 + }, + { + "epoch": 1.16, + "grad_norm": 12.87671922318729, + "learning_rate": 7.697414741460616e-06, + "loss": 0.687, + "step": 6825 + }, + { + "epoch": 1.16, + "grad_norm": 7.712200059941566, + "learning_rate": 7.693249311306433e-06, + "loss": 0.6904, + "step": 6830 + }, + { + "epoch": 1.16, + "grad_norm": 7.774310159518635, + "learning_rate": 7.689081246273805e-06, + "loss": 0.6892, + "step": 6835 + }, + { + "epoch": 1.16, + "grad_norm": 18.661289555094946, + "learning_rate": 7.684910550440462e-06, + "loss": 0.6999, + "step": 6840 + }, + { + "epoch": 1.16, + "grad_norm": 33.664597069197804, + "learning_rate": 7.680737227886708e-06, + "loss": 0.7047, + "step": 6845 + }, + { + "epoch": 1.16, + "grad_norm": 20.181955771404265, + "learning_rate": 7.67656128269542e-06, + "loss": 0.7261, + "step": 6850 + }, + { + "epoch": 1.17, + "grad_norm": 30.26147343843953, + "learning_rate": 7.672382718952037e-06, + "loss": 0.7052, + "step": 6855 + }, + { + "epoch": 1.17, + "grad_norm": 20.667253974389084, + "learning_rate": 7.668201540744556e-06, + "loss": 0.711, + "step": 6860 + }, + { + "epoch": 1.17, + "grad_norm": 42.630822600609775, + "learning_rate": 7.664017752163542e-06, + "loss": 0.7036, + "step": 6865 + }, + { + "epoch": 1.17, + "grad_norm": 16.345967377044545, + "learning_rate": 7.659831357302105e-06, + "loss": 0.6998, + "step": 6870 + }, + { + "epoch": 1.17, + "grad_norm": 7.85298791802671, + "learning_rate": 7.655642360255911e-06, + "loss": 0.6963, + "step": 6875 + }, + { + "epoch": 1.17, + "grad_norm": 15.02513471044103, + "learning_rate": 7.651450765123165e-06, + "loss": 0.6913, + "step": 6880 + }, + { + "epoch": 1.17, + "grad_norm": 21.157869645277824, + "learning_rate": 7.647256576004619e-06, + "loss": 0.6693, + "step": 6885 + }, + { + "epoch": 1.17, + "grad_norm": 22.99490929473168, + "learning_rate": 7.643059797003564e-06, + "loss": 0.6849, + "step": 6890 + }, + { + "epoch": 1.17, + "grad_norm": 27.020744662631394, + "learning_rate": 7.638860432225818e-06, + "loss": 0.723, + "step": 6895 + }, + { + "epoch": 1.17, + "grad_norm": 7.624030473466098, + "learning_rate": 7.634658485779736e-06, + "loss": 0.6946, + "step": 6900 + }, + { + "epoch": 1.17, + "grad_norm": 9.65357246747531, + "learning_rate": 7.630453961776195e-06, + "loss": 0.6892, + "step": 6905 + }, + { + "epoch": 1.17, + "grad_norm": 12.093852140081328, + "learning_rate": 7.6262468643285926e-06, + "loss": 0.6927, + "step": 6910 + }, + { + "epoch": 1.18, + "grad_norm": 13.094869879189778, + "learning_rate": 7.622037197552846e-06, + "loss": 0.6913, + "step": 6915 + }, + { + "epoch": 1.18, + "grad_norm": 11.165975514550444, + "learning_rate": 7.617824965567387e-06, + "loss": 0.7019, + "step": 6920 + }, + { + "epoch": 1.18, + "grad_norm": 19.50880196493642, + "learning_rate": 7.613610172493156e-06, + "loss": 0.7077, + "step": 6925 + }, + { + "epoch": 1.18, + "grad_norm": 11.672651849911297, + "learning_rate": 7.609392822453596e-06, + "loss": 0.7017, + "step": 6930 + }, + { + "epoch": 1.18, + "grad_norm": 10.164537307950836, + "learning_rate": 7.605172919574657e-06, + "loss": 0.7015, + "step": 6935 + }, + { + "epoch": 1.18, + "grad_norm": 7.451099827206772, + "learning_rate": 7.600950467984783e-06, + "loss": 0.6752, + "step": 6940 + }, + { + "epoch": 1.18, + "grad_norm": 16.870539162624986, + "learning_rate": 7.5967254718149145e-06, + "loss": 0.6853, + "step": 6945 + }, + { + "epoch": 1.18, + "grad_norm": 15.135503893996802, + "learning_rate": 7.592497935198474e-06, + "loss": 0.6946, + "step": 6950 + }, + { + "epoch": 1.18, + "grad_norm": 9.045389341400304, + "learning_rate": 7.588267862271379e-06, + "loss": 0.7116, + "step": 6955 + }, + { + "epoch": 1.18, + "grad_norm": 7.973021024474602, + "learning_rate": 7.584035257172022e-06, + "loss": 0.7072, + "step": 6960 + }, + { + "epoch": 1.18, + "grad_norm": 17.220087665017225, + "learning_rate": 7.579800124041276e-06, + "loss": 0.6944, + "step": 6965 + }, + { + "epoch": 1.18, + "grad_norm": 39.31130380483513, + "learning_rate": 7.575562467022484e-06, + "loss": 0.701, + "step": 6970 + }, + { + "epoch": 1.19, + "grad_norm": 33.3590421783122, + "learning_rate": 7.571322290261462e-06, + "loss": 0.7133, + "step": 6975 + }, + { + "epoch": 1.19, + "grad_norm": 43.2227104390999, + "learning_rate": 7.567079597906491e-06, + "loss": 0.7223, + "step": 6980 + }, + { + "epoch": 1.19, + "grad_norm": 52.20311146641715, + "learning_rate": 7.5628343941083074e-06, + "loss": 0.7041, + "step": 6985 + }, + { + "epoch": 1.19, + "grad_norm": 69.35024918310937, + "learning_rate": 7.5585866830201086e-06, + "loss": 0.6854, + "step": 6990 + }, + { + "epoch": 1.19, + "grad_norm": 26.03096173960646, + "learning_rate": 7.554336468797549e-06, + "loss": 0.6967, + "step": 6995 + }, + { + "epoch": 1.19, + "grad_norm": 18.747268228543884, + "learning_rate": 7.550083755598723e-06, + "loss": 0.6989, + "step": 7000 + }, + { + "epoch": 1.19, + "grad_norm": 33.98837608030512, + "learning_rate": 7.5458285475841784e-06, + "loss": 0.6988, + "step": 7005 + }, + { + "epoch": 1.19, + "grad_norm": 30.343042257903118, + "learning_rate": 7.541570848916898e-06, + "loss": 0.6924, + "step": 7010 + }, + { + "epoch": 1.19, + "grad_norm": 20.01218820770515, + "learning_rate": 7.537310663762305e-06, + "loss": 0.6964, + "step": 7015 + }, + { + "epoch": 1.19, + "grad_norm": 7.42358496227341, + "learning_rate": 7.533047996288252e-06, + "loss": 0.6913, + "step": 7020 + }, + { + "epoch": 1.19, + "grad_norm": 12.964543921746909, + "learning_rate": 7.528782850665021e-06, + "loss": 0.7022, + "step": 7025 + }, + { + "epoch": 1.2, + "grad_norm": 25.8037225474705, + "learning_rate": 7.524515231065321e-06, + "loss": 0.6785, + "step": 7030 + }, + { + "epoch": 1.2, + "grad_norm": 9.350765998841295, + "learning_rate": 7.520245141664278e-06, + "loss": 0.6869, + "step": 7035 + }, + { + "epoch": 1.2, + "grad_norm": 8.861735010461002, + "learning_rate": 7.515972586639435e-06, + "loss": 0.6799, + "step": 7040 + }, + { + "epoch": 1.2, + "grad_norm": 18.877217521620906, + "learning_rate": 7.511697570170748e-06, + "loss": 0.7014, + "step": 7045 + }, + { + "epoch": 1.2, + "grad_norm": 7.505481993998394, + "learning_rate": 7.507420096440583e-06, + "loss": 0.6984, + "step": 7050 + }, + { + "epoch": 1.2, + "grad_norm": 6.778863037826613, + "learning_rate": 7.5031401696337066e-06, + "loss": 0.6979, + "step": 7055 + }, + { + "epoch": 1.2, + "grad_norm": 11.471077209038233, + "learning_rate": 7.498857793937286e-06, + "loss": 0.6974, + "step": 7060 + }, + { + "epoch": 1.2, + "grad_norm": 36.567832461479874, + "learning_rate": 7.494572973540886e-06, + "loss": 0.6899, + "step": 7065 + }, + { + "epoch": 1.2, + "grad_norm": 18.43230031875812, + "learning_rate": 7.4902857126364605e-06, + "loss": 0.6821, + "step": 7070 + }, + { + "epoch": 1.2, + "grad_norm": 24.623961517100078, + "learning_rate": 7.485996015418354e-06, + "loss": 0.6922, + "step": 7075 + }, + { + "epoch": 1.2, + "grad_norm": 8.646515546307073, + "learning_rate": 7.481703886083291e-06, + "loss": 0.6745, + "step": 7080 + }, + { + "epoch": 1.2, + "grad_norm": 31.396337867305935, + "learning_rate": 7.477409328830381e-06, + "loss": 0.6928, + "step": 7085 + }, + { + "epoch": 1.21, + "grad_norm": 20.424188079541075, + "learning_rate": 7.473112347861103e-06, + "loss": 0.6722, + "step": 7090 + }, + { + "epoch": 1.21, + "grad_norm": 26.00232330630306, + "learning_rate": 7.468812947379307e-06, + "loss": 0.6998, + "step": 7095 + }, + { + "epoch": 1.21, + "grad_norm": 27.46957886840992, + "learning_rate": 7.464511131591216e-06, + "loss": 0.6846, + "step": 7100 + }, + { + "epoch": 1.21, + "grad_norm": 9.132272963335847, + "learning_rate": 7.4602069047054105e-06, + "loss": 0.6982, + "step": 7105 + }, + { + "epoch": 1.21, + "grad_norm": 22.348386685560573, + "learning_rate": 7.4559002709328335e-06, + "loss": 0.6851, + "step": 7110 + }, + { + "epoch": 1.21, + "grad_norm": 17.814202412904162, + "learning_rate": 7.451591234486779e-06, + "loss": 0.7055, + "step": 7115 + }, + { + "epoch": 1.21, + "grad_norm": 12.859238788081717, + "learning_rate": 7.447279799582895e-06, + "loss": 0.6921, + "step": 7120 + }, + { + "epoch": 1.21, + "grad_norm": 14.588006740632137, + "learning_rate": 7.442965970439175e-06, + "loss": 0.6588, + "step": 7125 + }, + { + "epoch": 1.21, + "grad_norm": 16.47004331535925, + "learning_rate": 7.438649751275952e-06, + "loss": 0.6859, + "step": 7130 + }, + { + "epoch": 1.21, + "grad_norm": 17.497100321839717, + "learning_rate": 7.434331146315903e-06, + "loss": 0.7088, + "step": 7135 + }, + { + "epoch": 1.21, + "grad_norm": 16.389468678247884, + "learning_rate": 7.430010159784032e-06, + "loss": 0.6818, + "step": 7140 + }, + { + "epoch": 1.21, + "grad_norm": 35.137652159527434, + "learning_rate": 7.425686795907678e-06, + "loss": 0.6927, + "step": 7145 + }, + { + "epoch": 1.22, + "grad_norm": 24.705252151791072, + "learning_rate": 7.4213610589165055e-06, + "loss": 0.7036, + "step": 7150 + }, + { + "epoch": 1.22, + "grad_norm": 10.207253216482892, + "learning_rate": 7.4170329530425e-06, + "loss": 0.6846, + "step": 7155 + }, + { + "epoch": 1.22, + "grad_norm": 6.846196135712745, + "learning_rate": 7.412702482519962e-06, + "loss": 0.6822, + "step": 7160 + }, + { + "epoch": 1.22, + "grad_norm": 17.97292030080821, + "learning_rate": 7.40836965158551e-06, + "loss": 0.6978, + "step": 7165 + }, + { + "epoch": 1.22, + "grad_norm": 21.994990939537736, + "learning_rate": 7.404034464478069e-06, + "loss": 0.7095, + "step": 7170 + }, + { + "epoch": 1.22, + "grad_norm": 12.622262894881901, + "learning_rate": 7.399696925438868e-06, + "loss": 0.6591, + "step": 7175 + }, + { + "epoch": 1.22, + "grad_norm": 17.138206921967175, + "learning_rate": 7.39535703871144e-06, + "loss": 0.6758, + "step": 7180 + }, + { + "epoch": 1.22, + "grad_norm": 7.35115037571887, + "learning_rate": 7.391014808541611e-06, + "loss": 0.6926, + "step": 7185 + }, + { + "epoch": 1.22, + "grad_norm": 11.995736156812953, + "learning_rate": 7.386670239177504e-06, + "loss": 0.6809, + "step": 7190 + }, + { + "epoch": 1.22, + "grad_norm": 17.45647494417984, + "learning_rate": 7.382323334869529e-06, + "loss": 0.6822, + "step": 7195 + }, + { + "epoch": 1.22, + "grad_norm": 24.7016547776482, + "learning_rate": 7.377974099870378e-06, + "loss": 0.6785, + "step": 7200 + }, + { + "epoch": 1.22, + "grad_norm": 23.36867960192761, + "learning_rate": 7.373622538435024e-06, + "loss": 0.6808, + "step": 7205 + }, + { + "epoch": 1.23, + "grad_norm": 32.65941997390678, + "learning_rate": 7.369268654820718e-06, + "loss": 0.6907, + "step": 7210 + }, + { + "epoch": 1.23, + "grad_norm": 19.77258580914987, + "learning_rate": 7.3649124532869855e-06, + "loss": 0.702, + "step": 7215 + }, + { + "epoch": 1.23, + "grad_norm": 13.899504710148166, + "learning_rate": 7.36055393809561e-06, + "loss": 0.684, + "step": 7220 + }, + { + "epoch": 1.23, + "grad_norm": 11.847634728164552, + "learning_rate": 7.356193113510648e-06, + "loss": 0.687, + "step": 7225 + }, + { + "epoch": 1.23, + "grad_norm": 20.93469198208261, + "learning_rate": 7.3518299837984095e-06, + "loss": 0.6607, + "step": 7230 + }, + { + "epoch": 1.23, + "grad_norm": 44.311453507879676, + "learning_rate": 7.347464553227466e-06, + "loss": 0.6924, + "step": 7235 + }, + { + "epoch": 1.23, + "grad_norm": 16.396465443143537, + "learning_rate": 7.343096826068631e-06, + "loss": 0.7023, + "step": 7240 + }, + { + "epoch": 1.23, + "grad_norm": 17.15469275451865, + "learning_rate": 7.338726806594973e-06, + "loss": 0.684, + "step": 7245 + }, + { + "epoch": 1.23, + "grad_norm": 17.59304430753221, + "learning_rate": 7.334354499081797e-06, + "loss": 0.7031, + "step": 7250 + }, + { + "epoch": 1.23, + "grad_norm": 15.2512260575335, + "learning_rate": 7.329979907806655e-06, + "loss": 0.6959, + "step": 7255 + }, + { + "epoch": 1.23, + "grad_norm": 8.326978416349075, + "learning_rate": 7.3256030370493216e-06, + "loss": 0.6902, + "step": 7260 + }, + { + "epoch": 1.24, + "grad_norm": 12.110584046270967, + "learning_rate": 7.321223891091811e-06, + "loss": 0.6801, + "step": 7265 + }, + { + "epoch": 1.24, + "grad_norm": 7.953504528002581, + "learning_rate": 7.316842474218357e-06, + "loss": 0.6851, + "step": 7270 + }, + { + "epoch": 1.24, + "grad_norm": 16.575325837437997, + "learning_rate": 7.312458790715423e-06, + "loss": 0.6897, + "step": 7275 + }, + { + "epoch": 1.24, + "grad_norm": 21.78543946475332, + "learning_rate": 7.308072844871679e-06, + "loss": 0.6691, + "step": 7280 + }, + { + "epoch": 1.24, + "grad_norm": 14.868152052821847, + "learning_rate": 7.3036846409780175e-06, + "loss": 0.6942, + "step": 7285 + }, + { + "epoch": 1.24, + "grad_norm": 13.721946329731745, + "learning_rate": 7.299294183327534e-06, + "loss": 0.6692, + "step": 7290 + }, + { + "epoch": 1.24, + "grad_norm": 7.006162810748249, + "learning_rate": 7.294901476215537e-06, + "loss": 0.6773, + "step": 7295 + }, + { + "epoch": 1.24, + "grad_norm": 8.387648611571954, + "learning_rate": 7.290506523939524e-06, + "loss": 0.6833, + "step": 7300 + }, + { + "epoch": 1.24, + "grad_norm": 18.110771012553165, + "learning_rate": 7.286109330799198e-06, + "loss": 0.6709, + "step": 7305 + }, + { + "epoch": 1.24, + "grad_norm": 8.516727078372885, + "learning_rate": 7.2817099010964545e-06, + "loss": 0.6775, + "step": 7310 + }, + { + "epoch": 1.24, + "grad_norm": 38.802526080079346, + "learning_rate": 7.277308239135369e-06, + "loss": 0.7053, + "step": 7315 + }, + { + "epoch": 1.24, + "grad_norm": 32.465664109302125, + "learning_rate": 7.27290434922221e-06, + "loss": 0.6796, + "step": 7320 + }, + { + "epoch": 1.25, + "grad_norm": 14.845671379668499, + "learning_rate": 7.26849823566542e-06, + "loss": 0.7167, + "step": 7325 + }, + { + "epoch": 1.25, + "grad_norm": 17.518712361079984, + "learning_rate": 7.264089902775618e-06, + "loss": 0.7029, + "step": 7330 + }, + { + "epoch": 1.25, + "grad_norm": 11.250471903999252, + "learning_rate": 7.259679354865599e-06, + "loss": 0.6757, + "step": 7335 + }, + { + "epoch": 1.25, + "grad_norm": 13.190619690981686, + "learning_rate": 7.255266596250316e-06, + "loss": 0.6919, + "step": 7340 + }, + { + "epoch": 1.25, + "grad_norm": 25.288326771663677, + "learning_rate": 7.250851631246892e-06, + "loss": 0.6594, + "step": 7345 + }, + { + "epoch": 1.25, + "grad_norm": 6.247667804652537, + "learning_rate": 7.246434464174607e-06, + "loss": 0.6532, + "step": 7350 + }, + { + "epoch": 1.25, + "grad_norm": 14.682710149971358, + "learning_rate": 7.242015099354894e-06, + "loss": 0.6585, + "step": 7355 + }, + { + "epoch": 1.25, + "grad_norm": 27.239771872923306, + "learning_rate": 7.2375935411113375e-06, + "loss": 0.6923, + "step": 7360 + }, + { + "epoch": 1.25, + "grad_norm": 12.142724413466496, + "learning_rate": 7.2331697937696654e-06, + "loss": 0.6701, + "step": 7365 + }, + { + "epoch": 1.25, + "grad_norm": 19.053316941159274, + "learning_rate": 7.2287438616577496e-06, + "loss": 0.6655, + "step": 7370 + }, + { + "epoch": 1.25, + "grad_norm": 13.044116655990347, + "learning_rate": 7.2243157491056e-06, + "loss": 0.6751, + "step": 7375 + }, + { + "epoch": 1.25, + "grad_norm": 23.211289117451855, + "learning_rate": 7.219885460445358e-06, + "loss": 0.6909, + "step": 7380 + }, + { + "epoch": 1.26, + "grad_norm": 19.716287423668646, + "learning_rate": 7.215453000011295e-06, + "loss": 0.6851, + "step": 7385 + }, + { + "epoch": 1.26, + "grad_norm": 7.994891603259087, + "learning_rate": 7.211018372139804e-06, + "loss": 0.6716, + "step": 7390 + }, + { + "epoch": 1.26, + "grad_norm": 7.834328037352239, + "learning_rate": 7.2065815811694055e-06, + "loss": 0.6806, + "step": 7395 + }, + { + "epoch": 1.26, + "grad_norm": 6.668936421997721, + "learning_rate": 7.202142631440728e-06, + "loss": 0.6895, + "step": 7400 + }, + { + "epoch": 1.26, + "grad_norm": 13.712019148611033, + "learning_rate": 7.197701527296518e-06, + "loss": 0.6713, + "step": 7405 + }, + { + "epoch": 1.26, + "grad_norm": 7.575949154287882, + "learning_rate": 7.193258273081626e-06, + "loss": 0.6635, + "step": 7410 + }, + { + "epoch": 1.26, + "grad_norm": 6.801693793087312, + "learning_rate": 7.188812873143007e-06, + "loss": 0.6769, + "step": 7415 + }, + { + "epoch": 1.26, + "grad_norm": 12.726495724148158, + "learning_rate": 7.184365331829719e-06, + "loss": 0.6822, + "step": 7420 + }, + { + "epoch": 1.26, + "grad_norm": 11.831922759330421, + "learning_rate": 7.179915653492907e-06, + "loss": 0.6427, + "step": 7425 + }, + { + "epoch": 1.26, + "grad_norm": 19.151607114346472, + "learning_rate": 7.175463842485815e-06, + "loss": 0.6901, + "step": 7430 + }, + { + "epoch": 1.26, + "grad_norm": 34.95697762852432, + "learning_rate": 7.171009903163767e-06, + "loss": 0.6907, + "step": 7435 + }, + { + "epoch": 1.26, + "grad_norm": 20.584668811325557, + "learning_rate": 7.1665538398841746e-06, + "loss": 0.6821, + "step": 7440 + }, + { + "epoch": 1.27, + "grad_norm": 16.792833479274165, + "learning_rate": 7.162095657006523e-06, + "loss": 0.6822, + "step": 7445 + }, + { + "epoch": 1.27, + "grad_norm": 32.94215443286715, + "learning_rate": 7.157635358892374e-06, + "loss": 0.6733, + "step": 7450 + }, + { + "epoch": 1.27, + "grad_norm": 46.902507165291105, + "learning_rate": 7.153172949905357e-06, + "loss": 0.685, + "step": 7455 + }, + { + "epoch": 1.27, + "grad_norm": 37.05914931544486, + "learning_rate": 7.1487084344111665e-06, + "loss": 0.7108, + "step": 7460 + }, + { + "epoch": 1.27, + "grad_norm": 7.901869763596967, + "learning_rate": 7.144241816777559e-06, + "loss": 0.678, + "step": 7465 + }, + { + "epoch": 1.27, + "grad_norm": 31.0258209958262, + "learning_rate": 7.139773101374346e-06, + "loss": 0.6655, + "step": 7470 + }, + { + "epoch": 1.27, + "grad_norm": 13.571638831792662, + "learning_rate": 7.135302292573392e-06, + "loss": 0.6815, + "step": 7475 + }, + { + "epoch": 1.27, + "grad_norm": 14.721170462197327, + "learning_rate": 7.130829394748613e-06, + "loss": 0.6796, + "step": 7480 + }, + { + "epoch": 1.27, + "grad_norm": 28.939363425798188, + "learning_rate": 7.126354412275963e-06, + "loss": 0.6667, + "step": 7485 + }, + { + "epoch": 1.27, + "grad_norm": 24.334434659753086, + "learning_rate": 7.121877349533438e-06, + "loss": 0.6793, + "step": 7490 + }, + { + "epoch": 1.27, + "grad_norm": 17.334053700215673, + "learning_rate": 7.117398210901071e-06, + "loss": 0.6796, + "step": 7495 + }, + { + "epoch": 1.28, + "grad_norm": 13.658342801586011, + "learning_rate": 7.112917000760923e-06, + "loss": 0.668, + "step": 7500 + }, + { + "epoch": 1.28, + "grad_norm": 7.31412533720228, + "learning_rate": 7.108433723497081e-06, + "loss": 0.6665, + "step": 7505 + }, + { + "epoch": 1.28, + "grad_norm": 13.405988971024291, + "learning_rate": 7.103948383495659e-06, + "loss": 0.6748, + "step": 7510 + }, + { + "epoch": 1.28, + "grad_norm": 18.334245035577876, + "learning_rate": 7.099460985144784e-06, + "loss": 0.682, + "step": 7515 + }, + { + "epoch": 1.28, + "grad_norm": 35.91501841354662, + "learning_rate": 7.094971532834601e-06, + "loss": 0.6485, + "step": 7520 + }, + { + "epoch": 1.28, + "grad_norm": 14.892892396192009, + "learning_rate": 7.090480030957261e-06, + "loss": 0.6903, + "step": 7525 + }, + { + "epoch": 1.28, + "grad_norm": 17.90987688639934, + "learning_rate": 7.0859864839069205e-06, + "loss": 0.6636, + "step": 7530 + }, + { + "epoch": 1.28, + "grad_norm": 21.575729746016865, + "learning_rate": 7.081490896079738e-06, + "loss": 0.6779, + "step": 7535 + }, + { + "epoch": 1.28, + "grad_norm": 8.673927795129549, + "learning_rate": 7.076993271873871e-06, + "loss": 0.6817, + "step": 7540 + }, + { + "epoch": 1.28, + "grad_norm": 11.597632317299354, + "learning_rate": 7.072493615689464e-06, + "loss": 0.6743, + "step": 7545 + }, + { + "epoch": 1.28, + "grad_norm": 20.80844236847429, + "learning_rate": 7.067991931928653e-06, + "loss": 0.6773, + "step": 7550 + }, + { + "epoch": 1.28, + "grad_norm": 11.088359347091162, + "learning_rate": 7.063488224995555e-06, + "loss": 0.6706, + "step": 7555 + }, + { + "epoch": 1.29, + "grad_norm": 14.031454429113667, + "learning_rate": 7.058982499296271e-06, + "loss": 0.6812, + "step": 7560 + }, + { + "epoch": 1.29, + "grad_norm": 16.189777793794025, + "learning_rate": 7.0544747592388705e-06, + "loss": 0.677, + "step": 7565 + }, + { + "epoch": 1.29, + "grad_norm": 14.163766472519798, + "learning_rate": 7.049965009233399e-06, + "loss": 0.6629, + "step": 7570 + }, + { + "epoch": 1.29, + "grad_norm": 6.698400255214532, + "learning_rate": 7.045453253691868e-06, + "loss": 0.6697, + "step": 7575 + }, + { + "epoch": 1.29, + "grad_norm": 6.874532923503898, + "learning_rate": 7.040939497028247e-06, + "loss": 0.6755, + "step": 7580 + }, + { + "epoch": 1.29, + "grad_norm": 16.421821705632198, + "learning_rate": 7.0364237436584685e-06, + "loss": 0.6592, + "step": 7585 + }, + { + "epoch": 1.29, + "grad_norm": 9.231467938299327, + "learning_rate": 7.031905998000414e-06, + "loss": 0.662, + "step": 7590 + }, + { + "epoch": 1.29, + "grad_norm": 36.750873525169936, + "learning_rate": 7.027386264473914e-06, + "loss": 0.6562, + "step": 7595 + }, + { + "epoch": 1.29, + "grad_norm": 16.722367709743043, + "learning_rate": 7.022864547500751e-06, + "loss": 0.6704, + "step": 7600 + }, + { + "epoch": 1.29, + "grad_norm": 7.028304612662503, + "learning_rate": 7.018340851504637e-06, + "loss": 0.6637, + "step": 7605 + }, + { + "epoch": 1.29, + "grad_norm": 10.942067762843156, + "learning_rate": 7.01381518091123e-06, + "loss": 0.6681, + "step": 7610 + }, + { + "epoch": 1.29, + "grad_norm": 13.57676824119045, + "learning_rate": 7.009287540148113e-06, + "loss": 0.6918, + "step": 7615 + }, + { + "epoch": 1.3, + "grad_norm": 11.920375170456783, + "learning_rate": 7.004757933644801e-06, + "loss": 0.6575, + "step": 7620 + }, + { + "epoch": 1.3, + "grad_norm": 14.322859449563097, + "learning_rate": 7.000226365832729e-06, + "loss": 0.682, + "step": 7625 + }, + { + "epoch": 1.3, + "grad_norm": 9.312229271567947, + "learning_rate": 6.995692841145253e-06, + "loss": 0.6701, + "step": 7630 + }, + { + "epoch": 1.3, + "grad_norm": 7.707392654136837, + "learning_rate": 6.991157364017642e-06, + "loss": 0.6654, + "step": 7635 + }, + { + "epoch": 1.3, + "grad_norm": 6.796894963851315, + "learning_rate": 6.986619938887076e-06, + "loss": 0.6578, + "step": 7640 + }, + { + "epoch": 1.3, + "grad_norm": 11.853471127640145, + "learning_rate": 6.982080570192638e-06, + "loss": 0.6677, + "step": 7645 + }, + { + "epoch": 1.3, + "grad_norm": 12.12977247638146, + "learning_rate": 6.977539262375318e-06, + "loss": 0.658, + "step": 7650 + }, + { + "epoch": 1.3, + "grad_norm": 12.699221620073573, + "learning_rate": 6.972996019877998e-06, + "loss": 0.659, + "step": 7655 + }, + { + "epoch": 1.3, + "grad_norm": 7.424426213591365, + "learning_rate": 6.968450847145456e-06, + "loss": 0.6668, + "step": 7660 + }, + { + "epoch": 1.3, + "grad_norm": 18.343320233271434, + "learning_rate": 6.963903748624356e-06, + "loss": 0.6363, + "step": 7665 + }, + { + "epoch": 1.3, + "grad_norm": 27.69523332992964, + "learning_rate": 6.959354728763247e-06, + "loss": 0.6543, + "step": 7670 + }, + { + "epoch": 1.3, + "grad_norm": 7.276146037274146, + "learning_rate": 6.954803792012559e-06, + "loss": 0.6855, + "step": 7675 + }, + { + "epoch": 1.31, + "grad_norm": 5.700634905867604, + "learning_rate": 6.950250942824595e-06, + "loss": 0.6564, + "step": 7680 + }, + { + "epoch": 1.31, + "grad_norm": 17.506750873615168, + "learning_rate": 6.945696185653532e-06, + "loss": 0.6768, + "step": 7685 + }, + { + "epoch": 1.31, + "grad_norm": 16.791087185631472, + "learning_rate": 6.941139524955409e-06, + "loss": 0.6658, + "step": 7690 + }, + { + "epoch": 1.31, + "grad_norm": 8.266362817079182, + "learning_rate": 6.936580965188133e-06, + "loss": 0.6703, + "step": 7695 + }, + { + "epoch": 1.31, + "grad_norm": 16.05943328414273, + "learning_rate": 6.9320205108114634e-06, + "loss": 0.634, + "step": 7700 + }, + { + "epoch": 1.31, + "grad_norm": 34.555429787519394, + "learning_rate": 6.927458166287017e-06, + "loss": 0.6676, + "step": 7705 + }, + { + "epoch": 1.31, + "grad_norm": 7.106677063243689, + "learning_rate": 6.9228939360782585e-06, + "loss": 0.6558, + "step": 7710 + }, + { + "epoch": 1.31, + "grad_norm": 15.555116707313184, + "learning_rate": 6.918327824650497e-06, + "loss": 0.6678, + "step": 7715 + }, + { + "epoch": 1.31, + "grad_norm": 34.94775840826066, + "learning_rate": 6.913759836470884e-06, + "loss": 0.6676, + "step": 7720 + }, + { + "epoch": 1.31, + "grad_norm": 23.503473494897815, + "learning_rate": 6.9091899760084025e-06, + "loss": 0.6712, + "step": 7725 + }, + { + "epoch": 1.31, + "grad_norm": 7.198449503546897, + "learning_rate": 6.904618247733874e-06, + "loss": 0.6821, + "step": 7730 + }, + { + "epoch": 1.32, + "grad_norm": 6.423391165147742, + "learning_rate": 6.90004465611994e-06, + "loss": 0.665, + "step": 7735 + }, + { + "epoch": 1.32, + "grad_norm": 27.492408282325805, + "learning_rate": 6.895469205641071e-06, + "loss": 0.6656, + "step": 7740 + }, + { + "epoch": 1.32, + "grad_norm": 27.131001091554225, + "learning_rate": 6.890891900773552e-06, + "loss": 0.6466, + "step": 7745 + }, + { + "epoch": 1.32, + "grad_norm": 12.343118919497925, + "learning_rate": 6.886312745995485e-06, + "loss": 0.6613, + "step": 7750 + }, + { + "epoch": 1.32, + "grad_norm": 26.29602942837788, + "learning_rate": 6.881731745786779e-06, + "loss": 0.6744, + "step": 7755 + }, + { + "epoch": 1.32, + "grad_norm": 34.93533020196708, + "learning_rate": 6.877148904629154e-06, + "loss": 0.6661, + "step": 7760 + }, + { + "epoch": 1.32, + "grad_norm": 20.015281493550773, + "learning_rate": 6.872564227006122e-06, + "loss": 0.6672, + "step": 7765 + }, + { + "epoch": 1.32, + "grad_norm": 8.05475189958386, + "learning_rate": 6.867977717403e-06, + "loss": 0.6693, + "step": 7770 + }, + { + "epoch": 1.32, + "grad_norm": 19.722271714945407, + "learning_rate": 6.863389380306894e-06, + "loss": 0.6785, + "step": 7775 + }, + { + "epoch": 1.32, + "grad_norm": 13.594100948343948, + "learning_rate": 6.858799220206698e-06, + "loss": 0.6808, + "step": 7780 + }, + { + "epoch": 1.32, + "grad_norm": 11.256070128775296, + "learning_rate": 6.854207241593086e-06, + "loss": 0.6684, + "step": 7785 + }, + { + "epoch": 1.32, + "grad_norm": 7.284928024061565, + "learning_rate": 6.849613448958518e-06, + "loss": 0.6584, + "step": 7790 + }, + { + "epoch": 1.33, + "grad_norm": 10.343801575420452, + "learning_rate": 6.845017846797224e-06, + "loss": 0.6453, + "step": 7795 + }, + { + "epoch": 1.33, + "grad_norm": 5.988882531705171, + "learning_rate": 6.840420439605207e-06, + "loss": 0.6343, + "step": 7800 + }, + { + "epoch": 1.33, + "grad_norm": 20.704821166327548, + "learning_rate": 6.835821231880233e-06, + "loss": 0.6438, + "step": 7805 + }, + { + "epoch": 1.33, + "grad_norm": 23.222455538039853, + "learning_rate": 6.831220228121831e-06, + "loss": 0.6546, + "step": 7810 + }, + { + "epoch": 1.33, + "grad_norm": 21.479424891039326, + "learning_rate": 6.826617432831286e-06, + "loss": 0.6563, + "step": 7815 + }, + { + "epoch": 1.33, + "grad_norm": 42.80175925549784, + "learning_rate": 6.822012850511641e-06, + "loss": 0.6724, + "step": 7820 + }, + { + "epoch": 1.33, + "grad_norm": 27.776123859447747, + "learning_rate": 6.81740648566768e-06, + "loss": 0.6648, + "step": 7825 + }, + { + "epoch": 1.33, + "grad_norm": 9.208497254204323, + "learning_rate": 6.812798342805933e-06, + "loss": 0.6484, + "step": 7830 + }, + { + "epoch": 1.33, + "grad_norm": 31.762321890202262, + "learning_rate": 6.808188426434672e-06, + "loss": 0.6459, + "step": 7835 + }, + { + "epoch": 1.33, + "grad_norm": 8.06389345005782, + "learning_rate": 6.803576741063903e-06, + "loss": 0.6704, + "step": 7840 + }, + { + "epoch": 1.33, + "grad_norm": 8.02164928639619, + "learning_rate": 6.79896329120536e-06, + "loss": 0.6617, + "step": 7845 + }, + { + "epoch": 1.33, + "grad_norm": 16.709489451722483, + "learning_rate": 6.794348081372507e-06, + "loss": 0.6528, + "step": 7850 + }, + { + "epoch": 1.34, + "grad_norm": 15.238437551559613, + "learning_rate": 6.789731116080529e-06, + "loss": 0.6407, + "step": 7855 + }, + { + "epoch": 1.34, + "grad_norm": 11.206729482619727, + "learning_rate": 6.785112399846328e-06, + "loss": 0.6625, + "step": 7860 + }, + { + "epoch": 1.34, + "grad_norm": 7.776138225675623, + "learning_rate": 6.780491937188514e-06, + "loss": 0.6543, + "step": 7865 + }, + { + "epoch": 1.34, + "grad_norm": 11.706557883721104, + "learning_rate": 6.775869732627417e-06, + "loss": 0.6576, + "step": 7870 + }, + { + "epoch": 1.34, + "grad_norm": 10.764273172448627, + "learning_rate": 6.771245790685059e-06, + "loss": 0.6767, + "step": 7875 + }, + { + "epoch": 1.34, + "grad_norm": 52.38525602824595, + "learning_rate": 6.766620115885172e-06, + "loss": 0.6719, + "step": 7880 + }, + { + "epoch": 1.34, + "grad_norm": 15.77477399033817, + "learning_rate": 6.761992712753173e-06, + "loss": 0.6544, + "step": 7885 + }, + { + "epoch": 1.34, + "grad_norm": 12.49369698024647, + "learning_rate": 6.757363585816178e-06, + "loss": 0.6428, + "step": 7890 + }, + { + "epoch": 1.34, + "grad_norm": 7.283967365331387, + "learning_rate": 6.7527327396029875e-06, + "loss": 0.6457, + "step": 7895 + }, + { + "epoch": 1.34, + "grad_norm": 6.407807058273839, + "learning_rate": 6.748100178644082e-06, + "loss": 0.6507, + "step": 7900 + }, + { + "epoch": 1.34, + "grad_norm": 15.881801966972798, + "learning_rate": 6.743465907471623e-06, + "loss": 0.663, + "step": 7905 + }, + { + "epoch": 1.34, + "grad_norm": 18.894217087735242, + "learning_rate": 6.738829930619438e-06, + "loss": 0.6609, + "step": 7910 + }, + { + "epoch": 1.35, + "grad_norm": 15.343860422804275, + "learning_rate": 6.734192252623034e-06, + "loss": 0.6484, + "step": 7915 + }, + { + "epoch": 1.35, + "grad_norm": 12.817117023965405, + "learning_rate": 6.729552878019574e-06, + "loss": 0.6381, + "step": 7920 + }, + { + "epoch": 1.35, + "grad_norm": 17.166428445424156, + "learning_rate": 6.724911811347883e-06, + "loss": 0.6493, + "step": 7925 + }, + { + "epoch": 1.35, + "grad_norm": 10.180628433066271, + "learning_rate": 6.7202690571484406e-06, + "loss": 0.6647, + "step": 7930 + }, + { + "epoch": 1.35, + "grad_norm": 14.866832953011489, + "learning_rate": 6.71562461996338e-06, + "loss": 0.6332, + "step": 7935 + }, + { + "epoch": 1.35, + "grad_norm": 21.691113499249862, + "learning_rate": 6.71097850433648e-06, + "loss": 0.6726, + "step": 7940 + }, + { + "epoch": 1.35, + "grad_norm": 10.55997210610206, + "learning_rate": 6.706330714813161e-06, + "loss": 0.6354, + "step": 7945 + }, + { + "epoch": 1.35, + "grad_norm": 19.705598228965442, + "learning_rate": 6.701681255940478e-06, + "loss": 0.6406, + "step": 7950 + }, + { + "epoch": 1.35, + "grad_norm": 7.295922874790215, + "learning_rate": 6.697030132267124e-06, + "loss": 0.6578, + "step": 7955 + }, + { + "epoch": 1.35, + "grad_norm": 16.841094205265872, + "learning_rate": 6.692377348343419e-06, + "loss": 0.6552, + "step": 7960 + }, + { + "epoch": 1.35, + "grad_norm": 22.95613214066316, + "learning_rate": 6.687722908721308e-06, + "loss": 0.6734, + "step": 7965 + }, + { + "epoch": 1.35, + "grad_norm": 27.35634967120633, + "learning_rate": 6.683066817954353e-06, + "loss": 0.6497, + "step": 7970 + }, + { + "epoch": 1.36, + "grad_norm": 14.417662535768118, + "learning_rate": 6.678409080597732e-06, + "loss": 0.6628, + "step": 7975 + }, + { + "epoch": 1.36, + "grad_norm": 10.305778063010779, + "learning_rate": 6.673749701208239e-06, + "loss": 0.6278, + "step": 7980 + }, + { + "epoch": 1.36, + "grad_norm": 12.067243154378666, + "learning_rate": 6.669088684344266e-06, + "loss": 0.6304, + "step": 7985 + }, + { + "epoch": 1.36, + "grad_norm": 7.237713121087854, + "learning_rate": 6.664426034565814e-06, + "loss": 0.6533, + "step": 7990 + }, + { + "epoch": 1.36, + "grad_norm": 6.6883233127354975, + "learning_rate": 6.6597617564344796e-06, + "loss": 0.6452, + "step": 7995 + }, + { + "epoch": 1.36, + "grad_norm": 6.400448497043079, + "learning_rate": 6.6550958545134515e-06, + "loss": 0.6489, + "step": 8000 + }, + { + "epoch": 1.36, + "grad_norm": 7.767147219100383, + "learning_rate": 6.6504283333675065e-06, + "loss": 0.6449, + "step": 8005 + }, + { + "epoch": 1.36, + "grad_norm": 10.370384698947925, + "learning_rate": 6.645759197563008e-06, + "loss": 0.6429, + "step": 8010 + }, + { + "epoch": 1.36, + "grad_norm": 7.061534266870285, + "learning_rate": 6.641088451667894e-06, + "loss": 0.662, + "step": 8015 + }, + { + "epoch": 1.36, + "grad_norm": 21.18127236595691, + "learning_rate": 6.636416100251687e-06, + "loss": 0.6504, + "step": 8020 + }, + { + "epoch": 1.36, + "grad_norm": 18.535465862975343, + "learning_rate": 6.631742147885468e-06, + "loss": 0.6772, + "step": 8025 + }, + { + "epoch": 1.37, + "grad_norm": 6.964640726111356, + "learning_rate": 6.627066599141895e-06, + "loss": 0.6597, + "step": 8030 + }, + { + "epoch": 1.37, + "grad_norm": 11.845507943939163, + "learning_rate": 6.622389458595182e-06, + "loss": 0.6469, + "step": 8035 + }, + { + "epoch": 1.37, + "grad_norm": 16.080552123187886, + "learning_rate": 6.617710730821103e-06, + "loss": 0.6429, + "step": 8040 + }, + { + "epoch": 1.37, + "grad_norm": 21.511449344925648, + "learning_rate": 6.6130304203969805e-06, + "loss": 0.6482, + "step": 8045 + }, + { + "epoch": 1.37, + "grad_norm": 10.772854516081404, + "learning_rate": 6.608348531901692e-06, + "loss": 0.6619, + "step": 8050 + }, + { + "epoch": 1.37, + "grad_norm": 12.625015490224003, + "learning_rate": 6.603665069915654e-06, + "loss": 0.6568, + "step": 8055 + }, + { + "epoch": 1.37, + "grad_norm": 25.861303879556942, + "learning_rate": 6.5989800390208226e-06, + "loss": 0.6386, + "step": 8060 + }, + { + "epoch": 1.37, + "grad_norm": 17.4888690477709, + "learning_rate": 6.59429344380069e-06, + "loss": 0.6436, + "step": 8065 + }, + { + "epoch": 1.37, + "grad_norm": 12.131438242239497, + "learning_rate": 6.5896052888402805e-06, + "loss": 0.6476, + "step": 8070 + }, + { + "epoch": 1.37, + "grad_norm": 12.257155345532185, + "learning_rate": 6.584915578726141e-06, + "loss": 0.6393, + "step": 8075 + }, + { + "epoch": 1.37, + "grad_norm": 17.259442077739987, + "learning_rate": 6.5802243180463425e-06, + "loss": 0.6219, + "step": 8080 + }, + { + "epoch": 1.37, + "grad_norm": 6.655363886815646, + "learning_rate": 6.575531511390469e-06, + "loss": 0.6438, + "step": 8085 + }, + { + "epoch": 1.38, + "grad_norm": 15.89276103817776, + "learning_rate": 6.570837163349624e-06, + "loss": 0.6257, + "step": 8090 + }, + { + "epoch": 1.38, + "grad_norm": 11.058137605602374, + "learning_rate": 6.566141278516413e-06, + "loss": 0.6516, + "step": 8095 + }, + { + "epoch": 1.38, + "grad_norm": 48.71355207579282, + "learning_rate": 6.561443861484946e-06, + "loss": 0.6402, + "step": 8100 + }, + { + "epoch": 1.38, + "grad_norm": 28.380628719561553, + "learning_rate": 6.5567449168508346e-06, + "loss": 0.654, + "step": 8105 + }, + { + "epoch": 1.38, + "grad_norm": 6.411789938460549, + "learning_rate": 6.552044449211181e-06, + "loss": 0.6552, + "step": 8110 + }, + { + "epoch": 1.38, + "grad_norm": 22.60346491316076, + "learning_rate": 6.54734246316458e-06, + "loss": 0.6531, + "step": 8115 + }, + { + "epoch": 1.38, + "grad_norm": 11.151539620095777, + "learning_rate": 6.542638963311112e-06, + "loss": 0.649, + "step": 8120 + }, + { + "epoch": 1.38, + "grad_norm": 29.874279752302918, + "learning_rate": 6.537933954252338e-06, + "loss": 0.6307, + "step": 8125 + }, + { + "epoch": 1.38, + "grad_norm": 55.371100766921124, + "learning_rate": 6.533227440591294e-06, + "loss": 0.6719, + "step": 8130 + }, + { + "epoch": 1.38, + "grad_norm": 13.23924904787888, + "learning_rate": 6.5285194269324895e-06, + "loss": 0.6554, + "step": 8135 + }, + { + "epoch": 1.38, + "grad_norm": 6.542786323664106, + "learning_rate": 6.523809917881902e-06, + "loss": 0.6363, + "step": 8140 + }, + { + "epoch": 1.38, + "grad_norm": 11.138369074140726, + "learning_rate": 6.5190989180469736e-06, + "loss": 0.6489, + "step": 8145 + }, + { + "epoch": 1.39, + "grad_norm": 20.644221672684967, + "learning_rate": 6.514386432036598e-06, + "loss": 0.6514, + "step": 8150 + }, + { + "epoch": 1.39, + "grad_norm": 7.53784502326818, + "learning_rate": 6.5096724644611296e-06, + "loss": 0.6505, + "step": 8155 + }, + { + "epoch": 1.39, + "grad_norm": 13.832374181994506, + "learning_rate": 6.5049570199323686e-06, + "loss": 0.6431, + "step": 8160 + }, + { + "epoch": 1.39, + "grad_norm": 9.328834025436, + "learning_rate": 6.500240103063564e-06, + "loss": 0.6243, + "step": 8165 + }, + { + "epoch": 1.39, + "grad_norm": 15.916018663546673, + "learning_rate": 6.495521718469402e-06, + "loss": 0.6457, + "step": 8170 + }, + { + "epoch": 1.39, + "grad_norm": 23.457435094958125, + "learning_rate": 6.490801870766004e-06, + "loss": 0.6568, + "step": 8175 + }, + { + "epoch": 1.39, + "grad_norm": 35.150856775834455, + "learning_rate": 6.486080564570925e-06, + "loss": 0.6513, + "step": 8180 + }, + { + "epoch": 1.39, + "grad_norm": 7.170893686186264, + "learning_rate": 6.481357804503147e-06, + "loss": 0.6362, + "step": 8185 + }, + { + "epoch": 1.39, + "grad_norm": 15.889101730266102, + "learning_rate": 6.476633595183073e-06, + "loss": 0.6487, + "step": 8190 + }, + { + "epoch": 1.39, + "grad_norm": 9.13019074213934, + "learning_rate": 6.471907941232525e-06, + "loss": 0.6449, + "step": 8195 + }, + { + "epoch": 1.39, + "grad_norm": 16.939034783675478, + "learning_rate": 6.467180847274737e-06, + "loss": 0.6366, + "step": 8200 + }, + { + "epoch": 1.39, + "grad_norm": 8.291379823253529, + "learning_rate": 6.462452317934352e-06, + "loss": 0.6348, + "step": 8205 + }, + { + "epoch": 1.4, + "grad_norm": 8.535236612949914, + "learning_rate": 6.45772235783742e-06, + "loss": 0.6284, + "step": 8210 + }, + { + "epoch": 1.4, + "grad_norm": 7.745474723331273, + "learning_rate": 6.452990971611384e-06, + "loss": 0.6245, + "step": 8215 + }, + { + "epoch": 1.4, + "grad_norm": 29.761555067812456, + "learning_rate": 6.448258163885092e-06, + "loss": 0.6626, + "step": 8220 + }, + { + "epoch": 1.4, + "grad_norm": 7.001809069262323, + "learning_rate": 6.443523939288776e-06, + "loss": 0.6446, + "step": 8225 + }, + { + "epoch": 1.4, + "grad_norm": 14.06191759505353, + "learning_rate": 6.438788302454053e-06, + "loss": 0.6637, + "step": 8230 + }, + { + "epoch": 1.4, + "grad_norm": 10.716149188351201, + "learning_rate": 6.434051258013928e-06, + "loss": 0.6606, + "step": 8235 + }, + { + "epoch": 1.4, + "grad_norm": 5.812890077780472, + "learning_rate": 6.429312810602777e-06, + "loss": 0.6329, + "step": 8240 + }, + { + "epoch": 1.4, + "grad_norm": 8.844538478736041, + "learning_rate": 6.424572964856351e-06, + "loss": 0.6463, + "step": 8245 + }, + { + "epoch": 1.4, + "grad_norm": 15.81354924758096, + "learning_rate": 6.4198317254117695e-06, + "loss": 0.6431, + "step": 8250 + }, + { + "epoch": 1.4, + "grad_norm": 15.492158449379378, + "learning_rate": 6.4150890969075145e-06, + "loss": 0.6432, + "step": 8255 + }, + { + "epoch": 1.4, + "grad_norm": 6.759512578968948, + "learning_rate": 6.410345083983427e-06, + "loss": 0.6303, + "step": 8260 + }, + { + "epoch": 1.41, + "grad_norm": 13.119349300664416, + "learning_rate": 6.405599691280706e-06, + "loss": 0.6543, + "step": 8265 + }, + { + "epoch": 1.41, + "grad_norm": 33.850079790425276, + "learning_rate": 6.400852923441892e-06, + "loss": 0.6455, + "step": 8270 + }, + { + "epoch": 1.41, + "grad_norm": 27.973834250364234, + "learning_rate": 6.3961047851108795e-06, + "loss": 0.6439, + "step": 8275 + }, + { + "epoch": 1.41, + "grad_norm": 18.791402441709256, + "learning_rate": 6.391355280932898e-06, + "loss": 0.6526, + "step": 8280 + }, + { + "epoch": 1.41, + "grad_norm": 6.8898792407939835, + "learning_rate": 6.386604415554521e-06, + "loss": 0.6257, + "step": 8285 + }, + { + "epoch": 1.41, + "grad_norm": 6.302907884289335, + "learning_rate": 6.381852193623641e-06, + "loss": 0.6526, + "step": 8290 + }, + { + "epoch": 1.41, + "grad_norm": 24.989620308397974, + "learning_rate": 6.377098619789489e-06, + "loss": 0.6402, + "step": 8295 + }, + { + "epoch": 1.41, + "grad_norm": 27.03639392548826, + "learning_rate": 6.372343698702612e-06, + "loss": 0.6241, + "step": 8300 + }, + { + "epoch": 1.41, + "grad_norm": 26.418474973434172, + "learning_rate": 6.367587435014882e-06, + "loss": 0.6298, + "step": 8305 + }, + { + "epoch": 1.41, + "grad_norm": 15.467358507902656, + "learning_rate": 6.362829833379475e-06, + "loss": 0.6539, + "step": 8310 + }, + { + "epoch": 1.41, + "grad_norm": 16.390598419534992, + "learning_rate": 6.358070898450884e-06, + "loss": 0.6575, + "step": 8315 + }, + { + "epoch": 1.41, + "grad_norm": 22.18745112063696, + "learning_rate": 6.353310634884901e-06, + "loss": 0.6289, + "step": 8320 + }, + { + "epoch": 1.42, + "grad_norm": 18.267245620225978, + "learning_rate": 6.348549047338626e-06, + "loss": 0.637, + "step": 8325 + }, + { + "epoch": 1.42, + "grad_norm": 13.441981411328962, + "learning_rate": 6.343786140470441e-06, + "loss": 0.6309, + "step": 8330 + }, + { + "epoch": 1.42, + "grad_norm": 14.388295631417312, + "learning_rate": 6.339021918940031e-06, + "loss": 0.6482, + "step": 8335 + }, + { + "epoch": 1.42, + "grad_norm": 20.34027340053568, + "learning_rate": 6.33425638740836e-06, + "loss": 0.6316, + "step": 8340 + }, + { + "epoch": 1.42, + "grad_norm": 18.72616964408831, + "learning_rate": 6.3294895505376784e-06, + "loss": 0.6445, + "step": 8345 + }, + { + "epoch": 1.42, + "grad_norm": 16.599848289761585, + "learning_rate": 6.324721412991509e-06, + "loss": 0.6373, + "step": 8350 + }, + { + "epoch": 1.42, + "grad_norm": 10.225968332536441, + "learning_rate": 6.31995197943465e-06, + "loss": 0.6426, + "step": 8355 + }, + { + "epoch": 1.42, + "grad_norm": 19.216949694680935, + "learning_rate": 6.315181254533167e-06, + "loss": 0.631, + "step": 8360 + }, + { + "epoch": 1.42, + "grad_norm": 8.470168620240177, + "learning_rate": 6.310409242954392e-06, + "loss": 0.6312, + "step": 8365 + }, + { + "epoch": 1.42, + "grad_norm": 15.414773113372265, + "learning_rate": 6.305635949366906e-06, + "loss": 0.6385, + "step": 8370 + }, + { + "epoch": 1.42, + "grad_norm": 34.07839646444887, + "learning_rate": 6.300861378440557e-06, + "loss": 0.6402, + "step": 8375 + }, + { + "epoch": 1.42, + "grad_norm": 16.71560255107337, + "learning_rate": 6.296085534846433e-06, + "loss": 0.6279, + "step": 8380 + }, + { + "epoch": 1.43, + "grad_norm": 37.75055046604799, + "learning_rate": 6.291308423256873e-06, + "loss": 0.622, + "step": 8385 + }, + { + "epoch": 1.43, + "grad_norm": 31.051698238500787, + "learning_rate": 6.286530048345452e-06, + "loss": 0.6399, + "step": 8390 + }, + { + "epoch": 1.43, + "grad_norm": 15.138592809970431, + "learning_rate": 6.281750414786983e-06, + "loss": 0.6423, + "step": 8395 + }, + { + "epoch": 1.43, + "grad_norm": 29.79002494254909, + "learning_rate": 6.276969527257513e-06, + "loss": 0.6361, + "step": 8400 + }, + { + "epoch": 1.43, + "grad_norm": 28.102533538762536, + "learning_rate": 6.272187390434311e-06, + "loss": 0.626, + "step": 8405 + }, + { + "epoch": 1.43, + "grad_norm": 31.97698553168282, + "learning_rate": 6.26740400899587e-06, + "loss": 0.6422, + "step": 8410 + }, + { + "epoch": 1.43, + "grad_norm": 9.465448441293175, + "learning_rate": 6.262619387621902e-06, + "loss": 0.6367, + "step": 8415 + }, + { + "epoch": 1.43, + "grad_norm": 7.061458649998584, + "learning_rate": 6.257833530993332e-06, + "loss": 0.6335, + "step": 8420 + }, + { + "epoch": 1.43, + "grad_norm": 15.88280726412742, + "learning_rate": 6.2530464437922936e-06, + "loss": 0.6312, + "step": 8425 + }, + { + "epoch": 1.43, + "grad_norm": 18.12066800542379, + "learning_rate": 6.2482581307021195e-06, + "loss": 0.6124, + "step": 8430 + }, + { + "epoch": 1.43, + "grad_norm": 16.94764296608338, + "learning_rate": 6.243468596407348e-06, + "loss": 0.615, + "step": 8435 + }, + { + "epoch": 1.43, + "grad_norm": 7.307650693950313, + "learning_rate": 6.238677845593709e-06, + "loss": 0.629, + "step": 8440 + }, + { + "epoch": 1.44, + "grad_norm": 8.164608367344734, + "learning_rate": 6.233885882948124e-06, + "loss": 0.6238, + "step": 8445 + }, + { + "epoch": 1.44, + "grad_norm": 10.618845576266027, + "learning_rate": 6.229092713158699e-06, + "loss": 0.6307, + "step": 8450 + }, + { + "epoch": 1.44, + "grad_norm": 5.999939335710288, + "learning_rate": 6.22429834091472e-06, + "loss": 0.6252, + "step": 8455 + }, + { + "epoch": 1.44, + "grad_norm": 27.277384481971897, + "learning_rate": 6.219502770906652e-06, + "loss": 0.6255, + "step": 8460 + }, + { + "epoch": 1.44, + "grad_norm": 26.658657226008486, + "learning_rate": 6.214706007826133e-06, + "loss": 0.6336, + "step": 8465 + }, + { + "epoch": 1.44, + "grad_norm": 9.790938205285673, + "learning_rate": 6.20990805636596e-06, + "loss": 0.6216, + "step": 8470 + }, + { + "epoch": 1.44, + "grad_norm": 10.181913156116149, + "learning_rate": 6.205108921220102e-06, + "loss": 0.6092, + "step": 8475 + }, + { + "epoch": 1.44, + "grad_norm": 6.651943934098993, + "learning_rate": 6.200308607083683e-06, + "loss": 0.6349, + "step": 8480 + }, + { + "epoch": 1.44, + "grad_norm": 10.036649904845357, + "learning_rate": 6.195507118652977e-06, + "loss": 0.6251, + "step": 8485 + }, + { + "epoch": 1.44, + "grad_norm": 14.146870640781684, + "learning_rate": 6.190704460625412e-06, + "loss": 0.6372, + "step": 8490 + }, + { + "epoch": 1.44, + "grad_norm": 11.004606776153171, + "learning_rate": 6.185900637699555e-06, + "loss": 0.6379, + "step": 8495 + }, + { + "epoch": 1.45, + "grad_norm": 12.261848092144794, + "learning_rate": 6.18109565457512e-06, + "loss": 0.6273, + "step": 8500 + }, + { + "epoch": 1.45, + "grad_norm": 8.517734637753321, + "learning_rate": 6.176289515952949e-06, + "loss": 0.6235, + "step": 8505 + }, + { + "epoch": 1.45, + "grad_norm": 16.182064084964512, + "learning_rate": 6.171482226535016e-06, + "loss": 0.6338, + "step": 8510 + }, + { + "epoch": 1.45, + "grad_norm": 27.011998601299425, + "learning_rate": 6.1666737910244234e-06, + "loss": 0.6279, + "step": 8515 + }, + { + "epoch": 1.45, + "grad_norm": 7.461115547233499, + "learning_rate": 6.161864214125393e-06, + "loss": 0.6187, + "step": 8520 + }, + { + "epoch": 1.45, + "grad_norm": 26.957756432468457, + "learning_rate": 6.157053500543265e-06, + "loss": 0.6401, + "step": 8525 + }, + { + "epoch": 1.45, + "grad_norm": 14.81850691520542, + "learning_rate": 6.152241654984488e-06, + "loss": 0.6462, + "step": 8530 + }, + { + "epoch": 1.45, + "grad_norm": 19.24439589408677, + "learning_rate": 6.147428682156621e-06, + "loss": 0.6354, + "step": 8535 + }, + { + "epoch": 1.45, + "grad_norm": 36.13098754949811, + "learning_rate": 6.142614586768325e-06, + "loss": 0.6291, + "step": 8540 + }, + { + "epoch": 1.45, + "grad_norm": 11.549996730002588, + "learning_rate": 6.137799373529361e-06, + "loss": 0.6183, + "step": 8545 + }, + { + "epoch": 1.45, + "grad_norm": 19.749594900839334, + "learning_rate": 6.132983047150579e-06, + "loss": 0.6197, + "step": 8550 + }, + { + "epoch": 1.45, + "grad_norm": 17.905561914987118, + "learning_rate": 6.128165612343923e-06, + "loss": 0.6175, + "step": 8555 + }, + { + "epoch": 1.46, + "grad_norm": 15.52868932543812, + "learning_rate": 6.1233470738224185e-06, + "loss": 0.6218, + "step": 8560 + }, + { + "epoch": 1.46, + "grad_norm": 14.267449464182201, + "learning_rate": 6.118527436300175e-06, + "loss": 0.6316, + "step": 8565 + }, + { + "epoch": 1.46, + "grad_norm": 17.904992821265832, + "learning_rate": 6.11370670449237e-06, + "loss": 0.639, + "step": 8570 + }, + { + "epoch": 1.46, + "grad_norm": 10.478604606870359, + "learning_rate": 6.108884883115256e-06, + "loss": 0.6413, + "step": 8575 + }, + { + "epoch": 1.46, + "grad_norm": 19.41975081931992, + "learning_rate": 6.1040619768861505e-06, + "loss": 0.6376, + "step": 8580 + }, + { + "epoch": 1.46, + "grad_norm": 28.473262465248553, + "learning_rate": 6.099237990523437e-06, + "loss": 0.619, + "step": 8585 + }, + { + "epoch": 1.46, + "grad_norm": 6.790384150641972, + "learning_rate": 6.094412928746546e-06, + "loss": 0.6234, + "step": 8590 + }, + { + "epoch": 1.46, + "grad_norm": 19.06048377191451, + "learning_rate": 6.089586796275968e-06, + "loss": 0.6226, + "step": 8595 + }, + { + "epoch": 1.46, + "grad_norm": 23.277298880330747, + "learning_rate": 6.084759597833239e-06, + "loss": 0.6264, + "step": 8600 + }, + { + "epoch": 1.46, + "grad_norm": 8.554662803269148, + "learning_rate": 6.079931338140936e-06, + "loss": 0.6305, + "step": 8605 + }, + { + "epoch": 1.46, + "grad_norm": 19.270349394789058, + "learning_rate": 6.075102021922677e-06, + "loss": 0.6439, + "step": 8610 + }, + { + "epoch": 1.46, + "grad_norm": 67.3469154291316, + "learning_rate": 6.070271653903112e-06, + "loss": 0.6062, + "step": 8615 + }, + { + "epoch": 1.47, + "grad_norm": 27.770308384712457, + "learning_rate": 6.06544023880792e-06, + "loss": 0.6329, + "step": 8620 + }, + { + "epoch": 1.47, + "grad_norm": 25.407479290475464, + "learning_rate": 6.060607781363807e-06, + "loss": 0.6315, + "step": 8625 + }, + { + "epoch": 1.47, + "grad_norm": 15.509769672420044, + "learning_rate": 6.055774286298492e-06, + "loss": 0.6365, + "step": 8630 + }, + { + "epoch": 1.47, + "grad_norm": 27.6938281196245, + "learning_rate": 6.050939758340716e-06, + "loss": 0.6473, + "step": 8635 + }, + { + "epoch": 1.47, + "grad_norm": 6.082490505267428, + "learning_rate": 6.046104202220228e-06, + "loss": 0.614, + "step": 8640 + }, + { + "epoch": 1.47, + "grad_norm": 10.77340992162146, + "learning_rate": 6.041267622667784e-06, + "loss": 0.6326, + "step": 8645 + }, + { + "epoch": 1.47, + "grad_norm": 10.152095095597254, + "learning_rate": 6.0364300244151385e-06, + "loss": 0.6233, + "step": 8650 + }, + { + "epoch": 1.47, + "grad_norm": 23.517070545128238, + "learning_rate": 6.031591412195046e-06, + "loss": 0.621, + "step": 8655 + }, + { + "epoch": 1.47, + "grad_norm": 10.288564339786074, + "learning_rate": 6.02675179074125e-06, + "loss": 0.6504, + "step": 8660 + }, + { + "epoch": 1.47, + "grad_norm": 6.512244767455876, + "learning_rate": 6.021911164788483e-06, + "loss": 0.6419, + "step": 8665 + }, + { + "epoch": 1.47, + "grad_norm": 11.139035823232772, + "learning_rate": 6.0170695390724595e-06, + "loss": 0.6154, + "step": 8670 + }, + { + "epoch": 1.47, + "grad_norm": 30.611473228247544, + "learning_rate": 6.012226918329874e-06, + "loss": 0.6163, + "step": 8675 + }, + { + "epoch": 1.48, + "grad_norm": 23.228208297353323, + "learning_rate": 6.007383307298391e-06, + "loss": 0.6235, + "step": 8680 + }, + { + "epoch": 1.48, + "grad_norm": 17.16179449932119, + "learning_rate": 6.002538710716649e-06, + "loss": 0.6376, + "step": 8685 + }, + { + "epoch": 1.48, + "grad_norm": 11.613436166968452, + "learning_rate": 5.997693133324244e-06, + "loss": 0.6184, + "step": 8690 + }, + { + "epoch": 1.48, + "grad_norm": 15.000653168372537, + "learning_rate": 5.992846579861737e-06, + "loss": 0.6334, + "step": 8695 + }, + { + "epoch": 1.48, + "grad_norm": 21.01047179127842, + "learning_rate": 5.98799905507064e-06, + "loss": 0.6309, + "step": 8700 + }, + { + "epoch": 1.48, + "grad_norm": 9.397516068200028, + "learning_rate": 5.98315056369342e-06, + "loss": 0.6194, + "step": 8705 + }, + { + "epoch": 1.48, + "grad_norm": 10.549455568952771, + "learning_rate": 5.978301110473486e-06, + "loss": 0.6266, + "step": 8710 + }, + { + "epoch": 1.48, + "grad_norm": 8.407050992301864, + "learning_rate": 5.973450700155187e-06, + "loss": 0.6169, + "step": 8715 + }, + { + "epoch": 1.48, + "grad_norm": 9.507586579212115, + "learning_rate": 5.968599337483814e-06, + "loss": 0.6138, + "step": 8720 + }, + { + "epoch": 1.48, + "grad_norm": 8.604919897561592, + "learning_rate": 5.963747027205583e-06, + "loss": 0.6308, + "step": 8725 + }, + { + "epoch": 1.48, + "grad_norm": 8.18989665536131, + "learning_rate": 5.95889377406764e-06, + "loss": 0.6248, + "step": 8730 + }, + { + "epoch": 1.49, + "grad_norm": 9.554625835371105, + "learning_rate": 5.954039582818053e-06, + "loss": 0.6174, + "step": 8735 + }, + { + "epoch": 1.49, + "grad_norm": 20.92839999126903, + "learning_rate": 5.949184458205811e-06, + "loss": 0.6188, + "step": 8740 + }, + { + "epoch": 1.49, + "grad_norm": 13.267413663263957, + "learning_rate": 5.944328404980813e-06, + "loss": 0.6414, + "step": 8745 + }, + { + "epoch": 1.49, + "grad_norm": 23.29476861797064, + "learning_rate": 5.939471427893862e-06, + "loss": 0.6219, + "step": 8750 + }, + { + "epoch": 1.49, + "grad_norm": 22.204976774549824, + "learning_rate": 5.934613531696677e-06, + "loss": 0.6137, + "step": 8755 + }, + { + "epoch": 1.49, + "grad_norm": 38.20480863930992, + "learning_rate": 5.929754721141863e-06, + "loss": 0.6239, + "step": 8760 + }, + { + "epoch": 1.49, + "grad_norm": 17.401029185094114, + "learning_rate": 5.924895000982929e-06, + "loss": 0.6106, + "step": 8765 + }, + { + "epoch": 1.49, + "grad_norm": 27.96896375943163, + "learning_rate": 5.920034375974267e-06, + "loss": 0.625, + "step": 8770 + }, + { + "epoch": 1.49, + "grad_norm": 15.712006261527762, + "learning_rate": 5.91517285087116e-06, + "loss": 0.6301, + "step": 8775 + }, + { + "epoch": 1.49, + "grad_norm": 10.332878892952024, + "learning_rate": 5.910310430429768e-06, + "loss": 0.6277, + "step": 8780 + }, + { + "epoch": 1.49, + "grad_norm": 8.420058266879154, + "learning_rate": 5.905447119407132e-06, + "loss": 0.6302, + "step": 8785 + }, + { + "epoch": 1.49, + "grad_norm": 20.417067516686085, + "learning_rate": 5.900582922561155e-06, + "loss": 0.6167, + "step": 8790 + }, + { + "epoch": 1.5, + "grad_norm": 7.297247346641591, + "learning_rate": 5.895717844650613e-06, + "loss": 0.6135, + "step": 8795 + }, + { + "epoch": 1.5, + "grad_norm": 31.71322909627282, + "learning_rate": 5.890851890435144e-06, + "loss": 0.6064, + "step": 8800 + }, + { + "epoch": 1.5, + "grad_norm": 15.536266619232359, + "learning_rate": 5.885985064675243e-06, + "loss": 0.609, + "step": 8805 + }, + { + "epoch": 1.5, + "grad_norm": 40.11580915341827, + "learning_rate": 5.881117372132257e-06, + "loss": 0.6287, + "step": 8810 + }, + { + "epoch": 1.5, + "grad_norm": 7.751306111236599, + "learning_rate": 5.876248817568379e-06, + "loss": 0.6161, + "step": 8815 + }, + { + "epoch": 1.5, + "grad_norm": 13.192346826103622, + "learning_rate": 5.871379405746647e-06, + "loss": 0.629, + "step": 8820 + }, + { + "epoch": 1.5, + "grad_norm": 18.071210970161694, + "learning_rate": 5.8665091414309395e-06, + "loss": 0.6142, + "step": 8825 + }, + { + "epoch": 1.5, + "grad_norm": 6.558916531059064, + "learning_rate": 5.861638029385969e-06, + "loss": 0.6144, + "step": 8830 + }, + { + "epoch": 1.5, + "grad_norm": 38.805705537435244, + "learning_rate": 5.856766074377273e-06, + "loss": 0.5987, + "step": 8835 + }, + { + "epoch": 1.5, + "grad_norm": 29.412936674979335, + "learning_rate": 5.851893281171217e-06, + "loss": 0.6259, + "step": 8840 + }, + { + "epoch": 1.5, + "grad_norm": 12.380415844971367, + "learning_rate": 5.8470196545349865e-06, + "loss": 0.6221, + "step": 8845 + }, + { + "epoch": 1.5, + "grad_norm": 21.738500373870316, + "learning_rate": 5.842145199236583e-06, + "loss": 0.6163, + "step": 8850 + }, + { + "epoch": 1.51, + "grad_norm": 44.689830654455925, + "learning_rate": 5.837269920044815e-06, + "loss": 0.6088, + "step": 8855 + }, + { + "epoch": 1.51, + "grad_norm": 22.913294256785697, + "learning_rate": 5.832393821729301e-06, + "loss": 0.6165, + "step": 8860 + }, + { + "epoch": 1.51, + "grad_norm": 22.51389955214667, + "learning_rate": 5.827516909060459e-06, + "loss": 0.5958, + "step": 8865 + }, + { + "epoch": 1.51, + "grad_norm": 21.03446683194453, + "learning_rate": 5.8226391868095064e-06, + "loss": 0.6123, + "step": 8870 + }, + { + "epoch": 1.51, + "grad_norm": 8.297799547031005, + "learning_rate": 5.817760659748448e-06, + "loss": 0.6207, + "step": 8875 + }, + { + "epoch": 1.51, + "grad_norm": 18.64278180009031, + "learning_rate": 5.812881332650079e-06, + "loss": 0.6187, + "step": 8880 + }, + { + "epoch": 1.51, + "grad_norm": 18.235311973907905, + "learning_rate": 5.808001210287978e-06, + "loss": 0.6083, + "step": 8885 + }, + { + "epoch": 1.51, + "grad_norm": 14.63252366698247, + "learning_rate": 5.803120297436498e-06, + "loss": 0.6127, + "step": 8890 + }, + { + "epoch": 1.51, + "grad_norm": 21.192227497855644, + "learning_rate": 5.7982385988707705e-06, + "loss": 0.6142, + "step": 8895 + }, + { + "epoch": 1.51, + "grad_norm": 19.526216402550123, + "learning_rate": 5.793356119366689e-06, + "loss": 0.6367, + "step": 8900 + }, + { + "epoch": 1.51, + "grad_norm": 7.321169117285131, + "learning_rate": 5.788472863700918e-06, + "loss": 0.6125, + "step": 8905 + }, + { + "epoch": 1.51, + "grad_norm": 10.52084507199846, + "learning_rate": 5.7835888366508785e-06, + "loss": 0.6203, + "step": 8910 + }, + { + "epoch": 1.52, + "grad_norm": 6.1271021467657425, + "learning_rate": 5.778704042994744e-06, + "loss": 0.6008, + "step": 8915 + }, + { + "epoch": 1.52, + "grad_norm": 7.254171708462301, + "learning_rate": 5.77381848751144e-06, + "loss": 0.6138, + "step": 8920 + }, + { + "epoch": 1.52, + "grad_norm": 11.320024335417965, + "learning_rate": 5.76893217498064e-06, + "loss": 0.618, + "step": 8925 + }, + { + "epoch": 1.52, + "grad_norm": 16.131875846221742, + "learning_rate": 5.764045110182752e-06, + "loss": 0.6103, + "step": 8930 + }, + { + "epoch": 1.52, + "grad_norm": 10.229969236321889, + "learning_rate": 5.759157297898924e-06, + "loss": 0.6286, + "step": 8935 + }, + { + "epoch": 1.52, + "grad_norm": 9.203086200392013, + "learning_rate": 5.754268742911037e-06, + "loss": 0.6078, + "step": 8940 + }, + { + "epoch": 1.52, + "grad_norm": 13.337291102785596, + "learning_rate": 5.749379450001693e-06, + "loss": 0.6291, + "step": 8945 + }, + { + "epoch": 1.52, + "grad_norm": 6.130884826076313, + "learning_rate": 5.74448942395422e-06, + "loss": 0.615, + "step": 8950 + }, + { + "epoch": 1.52, + "grad_norm": 7.161302708063478, + "learning_rate": 5.739598669552664e-06, + "loss": 0.6025, + "step": 8955 + }, + { + "epoch": 1.52, + "grad_norm": 10.614457781504585, + "learning_rate": 5.73470719158178e-06, + "loss": 0.5931, + "step": 8960 + }, + { + "epoch": 1.52, + "grad_norm": 16.727423296416706, + "learning_rate": 5.729814994827034e-06, + "loss": 0.6256, + "step": 8965 + }, + { + "epoch": 1.52, + "grad_norm": 20.883727699858422, + "learning_rate": 5.724922084074595e-06, + "loss": 0.6195, + "step": 8970 + }, + { + "epoch": 1.53, + "grad_norm": 12.987766905601958, + "learning_rate": 5.720028464111326e-06, + "loss": 0.6148, + "step": 8975 + }, + { + "epoch": 1.53, + "grad_norm": 7.839778190277152, + "learning_rate": 5.715134139724792e-06, + "loss": 0.5994, + "step": 8980 + }, + { + "epoch": 1.53, + "grad_norm": 15.502188480226746, + "learning_rate": 5.710239115703238e-06, + "loss": 0.5999, + "step": 8985 + }, + { + "epoch": 1.53, + "grad_norm": 16.162420148029522, + "learning_rate": 5.705343396835602e-06, + "loss": 0.6065, + "step": 8990 + }, + { + "epoch": 1.53, + "grad_norm": 16.526083835319188, + "learning_rate": 5.7004469879114955e-06, + "loss": 0.6127, + "step": 8995 + }, + { + "epoch": 1.53, + "grad_norm": 13.161343365344532, + "learning_rate": 5.6955498937212074e-06, + "loss": 0.6183, + "step": 9000 + }, + { + "epoch": 1.53, + "grad_norm": 13.014971463269138, + "learning_rate": 5.6906521190556976e-06, + "loss": 0.6128, + "step": 9005 + }, + { + "epoch": 1.53, + "grad_norm": 9.317961414655818, + "learning_rate": 5.685753668706591e-06, + "loss": 0.6106, + "step": 9010 + }, + { + "epoch": 1.53, + "grad_norm": 8.993800769863125, + "learning_rate": 5.680854547466174e-06, + "loss": 0.6173, + "step": 9015 + }, + { + "epoch": 1.53, + "grad_norm": 18.032312729410066, + "learning_rate": 5.67595476012739e-06, + "loss": 0.6204, + "step": 9020 + }, + { + "epoch": 1.53, + "grad_norm": 6.913492906194817, + "learning_rate": 5.671054311483833e-06, + "loss": 0.5956, + "step": 9025 + }, + { + "epoch": 1.54, + "grad_norm": 12.011346666019541, + "learning_rate": 5.66615320632974e-06, + "loss": 0.6032, + "step": 9030 + }, + { + "epoch": 1.54, + "grad_norm": 7.120145904095579, + "learning_rate": 5.66125144946e-06, + "loss": 0.6298, + "step": 9035 + }, + { + "epoch": 1.54, + "grad_norm": 9.41274470876883, + "learning_rate": 5.6563490456701296e-06, + "loss": 0.587, + "step": 9040 + }, + { + "epoch": 1.54, + "grad_norm": 10.915343026406369, + "learning_rate": 5.6514459997562855e-06, + "loss": 0.6216, + "step": 9045 + }, + { + "epoch": 1.54, + "grad_norm": 5.895671288900355, + "learning_rate": 5.64654231651525e-06, + "loss": 0.6183, + "step": 9050 + }, + { + "epoch": 1.54, + "grad_norm": 23.881102306393533, + "learning_rate": 5.641638000744425e-06, + "loss": 0.6228, + "step": 9055 + }, + { + "epoch": 1.54, + "grad_norm": 20.546332683774185, + "learning_rate": 5.636733057241839e-06, + "loss": 0.6191, + "step": 9060 + }, + { + "epoch": 1.54, + "grad_norm": 14.134020839761392, + "learning_rate": 5.631827490806128e-06, + "loss": 0.5941, + "step": 9065 + }, + { + "epoch": 1.54, + "grad_norm": 10.355455888132816, + "learning_rate": 5.626921306236541e-06, + "loss": 0.5877, + "step": 9070 + }, + { + "epoch": 1.54, + "grad_norm": 30.435020932897146, + "learning_rate": 5.622014508332932e-06, + "loss": 0.6148, + "step": 9075 + }, + { + "epoch": 1.54, + "grad_norm": 16.35724195784234, + "learning_rate": 5.617107101895751e-06, + "loss": 0.6164, + "step": 9080 + }, + { + "epoch": 1.54, + "grad_norm": 7.154665214652813, + "learning_rate": 5.6121990917260474e-06, + "loss": 0.6191, + "step": 9085 + }, + { + "epoch": 1.55, + "grad_norm": 16.512763141117652, + "learning_rate": 5.607290482625461e-06, + "loss": 0.6041, + "step": 9090 + }, + { + "epoch": 1.55, + "grad_norm": 7.269055262135025, + "learning_rate": 5.602381279396213e-06, + "loss": 0.6069, + "step": 9095 + }, + { + "epoch": 1.55, + "grad_norm": 18.536300771361045, + "learning_rate": 5.5974714868411125e-06, + "loss": 0.6053, + "step": 9100 + }, + { + "epoch": 1.55, + "grad_norm": 10.77546270410342, + "learning_rate": 5.592561109763542e-06, + "loss": 0.6037, + "step": 9105 + }, + { + "epoch": 1.55, + "grad_norm": 13.052644342202596, + "learning_rate": 5.587650152967454e-06, + "loss": 0.609, + "step": 9110 + }, + { + "epoch": 1.55, + "grad_norm": 11.336728354774031, + "learning_rate": 5.582738621257372e-06, + "loss": 0.6046, + "step": 9115 + }, + { + "epoch": 1.55, + "grad_norm": 6.189860501039402, + "learning_rate": 5.57782651943838e-06, + "loss": 0.6294, + "step": 9120 + }, + { + "epoch": 1.55, + "grad_norm": 15.104760202629917, + "learning_rate": 5.572913852316116e-06, + "loss": 0.6019, + "step": 9125 + }, + { + "epoch": 1.55, + "grad_norm": 10.372469587828709, + "learning_rate": 5.56800062469678e-06, + "loss": 0.5982, + "step": 9130 + }, + { + "epoch": 1.55, + "grad_norm": 26.10833699620634, + "learning_rate": 5.563086841387111e-06, + "loss": 0.5945, + "step": 9135 + }, + { + "epoch": 1.55, + "grad_norm": 5.931729991395199, + "learning_rate": 5.558172507194397e-06, + "loss": 0.6159, + "step": 9140 + }, + { + "epoch": 1.55, + "grad_norm": 9.704437918283011, + "learning_rate": 5.5532576269264635e-06, + "loss": 0.6167, + "step": 9145 + }, + { + "epoch": 1.56, + "grad_norm": 9.00692493270044, + "learning_rate": 5.5483422053916735e-06, + "loss": 0.5915, + "step": 9150 + }, + { + "epoch": 1.56, + "grad_norm": 9.21951340821045, + "learning_rate": 5.543426247398912e-06, + "loss": 0.5967, + "step": 9155 + }, + { + "epoch": 1.56, + "grad_norm": 5.8956292819034815, + "learning_rate": 5.538509757757594e-06, + "loss": 0.6109, + "step": 9160 + }, + { + "epoch": 1.56, + "grad_norm": 7.693620673196478, + "learning_rate": 5.533592741277658e-06, + "loss": 0.6159, + "step": 9165 + }, + { + "epoch": 1.56, + "grad_norm": 8.139425257113912, + "learning_rate": 5.528675202769549e-06, + "loss": 0.5993, + "step": 9170 + }, + { + "epoch": 1.56, + "grad_norm": 9.863801913245817, + "learning_rate": 5.52375714704423e-06, + "loss": 0.6, + "step": 9175 + }, + { + "epoch": 1.56, + "grad_norm": 18.185617291207205, + "learning_rate": 5.518838578913167e-06, + "loss": 0.6063, + "step": 9180 + }, + { + "epoch": 1.56, + "grad_norm": 7.2860494408934295, + "learning_rate": 5.513919503188328e-06, + "loss": 0.591, + "step": 9185 + }, + { + "epoch": 1.56, + "grad_norm": 34.38305181082742, + "learning_rate": 5.508999924682178e-06, + "loss": 0.6076, + "step": 9190 + }, + { + "epoch": 1.56, + "grad_norm": 22.92435759386956, + "learning_rate": 5.504079848207671e-06, + "loss": 0.6167, + "step": 9195 + }, + { + "epoch": 1.56, + "grad_norm": 12.760529664653179, + "learning_rate": 5.499159278578253e-06, + "loss": 0.6014, + "step": 9200 + }, + { + "epoch": 1.56, + "grad_norm": 13.628714969215856, + "learning_rate": 5.4942382206078495e-06, + "loss": 0.5977, + "step": 9205 + }, + { + "epoch": 1.57, + "grad_norm": 16.656187450738024, + "learning_rate": 5.489316679110864e-06, + "loss": 0.5956, + "step": 9210 + }, + { + "epoch": 1.57, + "grad_norm": 7.2821838796463245, + "learning_rate": 5.484394658902173e-06, + "loss": 0.6061, + "step": 9215 + }, + { + "epoch": 1.57, + "grad_norm": 19.831834598197847, + "learning_rate": 5.479472164797124e-06, + "loss": 0.5959, + "step": 9220 + }, + { + "epoch": 1.57, + "grad_norm": 10.214689267998624, + "learning_rate": 5.474549201611521e-06, + "loss": 0.6063, + "step": 9225 + }, + { + "epoch": 1.57, + "grad_norm": 6.16812247258966, + "learning_rate": 5.4696257741616366e-06, + "loss": 0.5894, + "step": 9230 + }, + { + "epoch": 1.57, + "grad_norm": 6.923420658187072, + "learning_rate": 5.464701887264188e-06, + "loss": 0.6208, + "step": 9235 + }, + { + "epoch": 1.57, + "grad_norm": 16.284791283442523, + "learning_rate": 5.45977754573635e-06, + "loss": 0.6002, + "step": 9240 + }, + { + "epoch": 1.57, + "grad_norm": 9.352219537142107, + "learning_rate": 5.454852754395738e-06, + "loss": 0.5926, + "step": 9245 + }, + { + "epoch": 1.57, + "grad_norm": 7.119697378052444, + "learning_rate": 5.449927518060407e-06, + "loss": 0.5869, + "step": 9250 + }, + { + "epoch": 1.57, + "grad_norm": 9.266374499933223, + "learning_rate": 5.44500184154885e-06, + "loss": 0.5999, + "step": 9255 + }, + { + "epoch": 1.57, + "grad_norm": 7.297673633134746, + "learning_rate": 5.44007572967999e-06, + "loss": 0.5976, + "step": 9260 + }, + { + "epoch": 1.58, + "grad_norm": 21.96269904481905, + "learning_rate": 5.435149187273172e-06, + "loss": 0.6065, + "step": 9265 + }, + { + "epoch": 1.58, + "grad_norm": 7.492653965184815, + "learning_rate": 5.430222219148168e-06, + "loss": 0.5974, + "step": 9270 + }, + { + "epoch": 1.58, + "grad_norm": 23.22740569704164, + "learning_rate": 5.4252948301251615e-06, + "loss": 0.5934, + "step": 9275 + }, + { + "epoch": 1.58, + "grad_norm": 8.173788220397405, + "learning_rate": 5.420367025024753e-06, + "loss": 0.6119, + "step": 9280 + }, + { + "epoch": 1.58, + "grad_norm": 6.937326719215955, + "learning_rate": 5.415438808667944e-06, + "loss": 0.6035, + "step": 9285 + }, + { + "epoch": 1.58, + "grad_norm": 25.668499087828007, + "learning_rate": 5.410510185876146e-06, + "loss": 0.6221, + "step": 9290 + }, + { + "epoch": 1.58, + "grad_norm": 8.964394347664827, + "learning_rate": 5.405581161471157e-06, + "loss": 0.6056, + "step": 9295 + }, + { + "epoch": 1.58, + "grad_norm": 6.995289947861624, + "learning_rate": 5.40065174027518e-06, + "loss": 0.5984, + "step": 9300 + }, + { + "epoch": 1.58, + "grad_norm": 7.214810192286005, + "learning_rate": 5.3957219271108e-06, + "loss": 0.5886, + "step": 9305 + }, + { + "epoch": 1.58, + "grad_norm": 11.989591849187818, + "learning_rate": 5.390791726800983e-06, + "loss": 0.6138, + "step": 9310 + }, + { + "epoch": 1.58, + "grad_norm": 8.808671808767826, + "learning_rate": 5.385861144169081e-06, + "loss": 0.6074, + "step": 9315 + }, + { + "epoch": 1.58, + "grad_norm": 7.561526838673997, + "learning_rate": 5.3809301840388126e-06, + "loss": 0.5923, + "step": 9320 + }, + { + "epoch": 1.59, + "grad_norm": 5.999545774089387, + "learning_rate": 5.375998851234272e-06, + "loss": 0.6118, + "step": 9325 + }, + { + "epoch": 1.59, + "grad_norm": 18.190251238079696, + "learning_rate": 5.371067150579912e-06, + "loss": 0.5854, + "step": 9330 + }, + { + "epoch": 1.59, + "grad_norm": 9.054453094014715, + "learning_rate": 5.366135086900552e-06, + "loss": 0.5986, + "step": 9335 + }, + { + "epoch": 1.59, + "grad_norm": 18.541785491546758, + "learning_rate": 5.361202665021359e-06, + "loss": 0.6017, + "step": 9340 + }, + { + "epoch": 1.59, + "grad_norm": 15.566548150106643, + "learning_rate": 5.356269889767857e-06, + "loss": 0.5875, + "step": 9345 + }, + { + "epoch": 1.59, + "grad_norm": 8.093235129596696, + "learning_rate": 5.351336765965913e-06, + "loss": 0.6059, + "step": 9350 + }, + { + "epoch": 1.59, + "grad_norm": 16.523942808346206, + "learning_rate": 5.3464032984417345e-06, + "loss": 0.5845, + "step": 9355 + }, + { + "epoch": 1.59, + "grad_norm": 9.225232390971318, + "learning_rate": 5.341469492021866e-06, + "loss": 0.5858, + "step": 9360 + }, + { + "epoch": 1.59, + "grad_norm": 10.56203769259536, + "learning_rate": 5.336535351533182e-06, + "loss": 0.5957, + "step": 9365 + }, + { + "epoch": 1.59, + "grad_norm": 8.809870152961832, + "learning_rate": 5.331600881802887e-06, + "loss": 0.5988, + "step": 9370 + }, + { + "epoch": 1.59, + "grad_norm": 8.809022292311157, + "learning_rate": 5.326666087658505e-06, + "loss": 0.607, + "step": 9375 + }, + { + "epoch": 1.59, + "grad_norm": 12.171141932866165, + "learning_rate": 5.321730973927879e-06, + "loss": 0.6137, + "step": 9380 + }, + { + "epoch": 1.6, + "grad_norm": 6.533028033705287, + "learning_rate": 5.316795545439162e-06, + "loss": 0.5943, + "step": 9385 + }, + { + "epoch": 1.6, + "grad_norm": 11.502217924497344, + "learning_rate": 5.31185980702082e-06, + "loss": 0.5949, + "step": 9390 + }, + { + "epoch": 1.6, + "grad_norm": 6.901020453643195, + "learning_rate": 5.306923763501616e-06, + "loss": 0.6005, + "step": 9395 + }, + { + "epoch": 1.6, + "grad_norm": 19.593915315383, + "learning_rate": 5.301987419710617e-06, + "loss": 0.6089, + "step": 9400 + }, + { + "epoch": 1.6, + "grad_norm": 9.3117371932334, + "learning_rate": 5.297050780477179e-06, + "loss": 0.5964, + "step": 9405 + }, + { + "epoch": 1.6, + "grad_norm": 8.826971591181108, + "learning_rate": 5.29211385063095e-06, + "loss": 0.592, + "step": 9410 + }, + { + "epoch": 1.6, + "grad_norm": 31.45574299091673, + "learning_rate": 5.287176635001863e-06, + "loss": 0.6082, + "step": 9415 + }, + { + "epoch": 1.6, + "grad_norm": 15.088086236450218, + "learning_rate": 5.282239138420127e-06, + "loss": 0.5993, + "step": 9420 + }, + { + "epoch": 1.6, + "grad_norm": 42.54559890377032, + "learning_rate": 5.277301365716228e-06, + "loss": 0.6037, + "step": 9425 + }, + { + "epoch": 1.6, + "grad_norm": 6.003789314693376, + "learning_rate": 5.272363321720926e-06, + "loss": 0.6034, + "step": 9430 + }, + { + "epoch": 1.6, + "grad_norm": 36.70258270216636, + "learning_rate": 5.267425011265239e-06, + "loss": 0.6, + "step": 9435 + }, + { + "epoch": 1.6, + "grad_norm": 20.528336862629228, + "learning_rate": 5.26248643918045e-06, + "loss": 0.5939, + "step": 9440 + }, + { + "epoch": 1.61, + "grad_norm": 26.2131550907509, + "learning_rate": 5.2575476102980995e-06, + "loss": 0.5955, + "step": 9445 + }, + { + "epoch": 1.61, + "grad_norm": 29.015949407973547, + "learning_rate": 5.252608529449973e-06, + "loss": 0.5867, + "step": 9450 + }, + { + "epoch": 1.61, + "grad_norm": 5.981498137225529, + "learning_rate": 5.2476692014681095e-06, + "loss": 0.6001, + "step": 9455 + }, + { + "epoch": 1.61, + "grad_norm": 28.895944965854035, + "learning_rate": 5.242729631184786e-06, + "loss": 0.6044, + "step": 9460 + }, + { + "epoch": 1.61, + "grad_norm": 11.041760611777507, + "learning_rate": 5.237789823432517e-06, + "loss": 0.5943, + "step": 9465 + }, + { + "epoch": 1.61, + "grad_norm": 8.521764242332235, + "learning_rate": 5.232849783044052e-06, + "loss": 0.5826, + "step": 9470 + }, + { + "epoch": 1.61, + "grad_norm": 10.398708372866475, + "learning_rate": 5.227909514852361e-06, + "loss": 0.607, + "step": 9475 + }, + { + "epoch": 1.61, + "grad_norm": 20.215730658188253, + "learning_rate": 5.222969023690645e-06, + "loss": 0.5931, + "step": 9480 + }, + { + "epoch": 1.61, + "grad_norm": 8.599945335944277, + "learning_rate": 5.218028314392318e-06, + "loss": 0.6104, + "step": 9485 + }, + { + "epoch": 1.61, + "grad_norm": 7.166393268974675, + "learning_rate": 5.213087391791013e-06, + "loss": 0.5799, + "step": 9490 + }, + { + "epoch": 1.61, + "grad_norm": 10.588862984883596, + "learning_rate": 5.208146260720565e-06, + "loss": 0.5951, + "step": 9495 + }, + { + "epoch": 1.62, + "grad_norm": 17.894396498559342, + "learning_rate": 5.203204926015014e-06, + "loss": 0.5937, + "step": 9500 + }, + { + "epoch": 1.62, + "grad_norm": 27.202565212697937, + "learning_rate": 5.1982633925086035e-06, + "loss": 0.5935, + "step": 9505 + }, + { + "epoch": 1.62, + "grad_norm": 23.281425496642882, + "learning_rate": 5.1933216650357685e-06, + "loss": 0.5886, + "step": 9510 + }, + { + "epoch": 1.62, + "grad_norm": 22.44444843338854, + "learning_rate": 5.188379748431135e-06, + "loss": 0.6094, + "step": 9515 + }, + { + "epoch": 1.62, + "grad_norm": 10.051494013021621, + "learning_rate": 5.1834376475295126e-06, + "loss": 0.5863, + "step": 9520 + }, + { + "epoch": 1.62, + "grad_norm": 13.514138256595807, + "learning_rate": 5.17849536716589e-06, + "loss": 0.5763, + "step": 9525 + }, + { + "epoch": 1.62, + "grad_norm": 7.214611501380019, + "learning_rate": 5.173552912175437e-06, + "loss": 0.5877, + "step": 9530 + }, + { + "epoch": 1.62, + "grad_norm": 7.452926704996535, + "learning_rate": 5.168610287393489e-06, + "loss": 0.5908, + "step": 9535 + }, + { + "epoch": 1.62, + "grad_norm": 9.43734189421902, + "learning_rate": 5.163667497655549e-06, + "loss": 0.5944, + "step": 9540 + }, + { + "epoch": 1.62, + "grad_norm": 5.50824964629719, + "learning_rate": 5.15872454779728e-06, + "loss": 0.5913, + "step": 9545 + }, + { + "epoch": 1.62, + "grad_norm": 6.772975708340726, + "learning_rate": 5.153781442654505e-06, + "loss": 0.5899, + "step": 9550 + }, + { + "epoch": 1.62, + "grad_norm": 5.791507395852812, + "learning_rate": 5.148838187063199e-06, + "loss": 0.5664, + "step": 9555 + }, + { + "epoch": 1.63, + "grad_norm": 9.19245147769233, + "learning_rate": 5.143894785859478e-06, + "loss": 0.579, + "step": 9560 + }, + { + "epoch": 1.63, + "grad_norm": 7.188990357721228, + "learning_rate": 5.138951243879608e-06, + "loss": 0.5956, + "step": 9565 + }, + { + "epoch": 1.63, + "grad_norm": 6.134172803566005, + "learning_rate": 5.134007565959986e-06, + "loss": 0.5759, + "step": 9570 + }, + { + "epoch": 1.63, + "grad_norm": 8.083896517270064, + "learning_rate": 5.1290637569371504e-06, + "loss": 0.5877, + "step": 9575 + }, + { + "epoch": 1.63, + "grad_norm": 19.32150938461579, + "learning_rate": 5.124119821647759e-06, + "loss": 0.5677, + "step": 9580 + }, + { + "epoch": 1.63, + "grad_norm": 7.6213419594999525, + "learning_rate": 5.119175764928599e-06, + "loss": 0.5851, + "step": 9585 + }, + { + "epoch": 1.63, + "grad_norm": 7.008101960171448, + "learning_rate": 5.114231591616573e-06, + "loss": 0.5703, + "step": 9590 + }, + { + "epoch": 1.63, + "grad_norm": 5.606662403436859, + "learning_rate": 5.1092873065487e-06, + "loss": 0.5754, + "step": 9595 + }, + { + "epoch": 1.63, + "grad_norm": 13.279716551224174, + "learning_rate": 5.104342914562107e-06, + "loss": 0.5706, + "step": 9600 + }, + { + "epoch": 1.63, + "grad_norm": 6.533706059181452, + "learning_rate": 5.0993984204940265e-06, + "loss": 0.5842, + "step": 9605 + }, + { + "epoch": 1.63, + "grad_norm": 27.84735675215916, + "learning_rate": 5.0944538291817904e-06, + "loss": 0.569, + "step": 9610 + }, + { + "epoch": 1.63, + "grad_norm": 5.401334890929006, + "learning_rate": 5.089509145462825e-06, + "loss": 0.5832, + "step": 9615 + }, + { + "epoch": 1.64, + "grad_norm": 10.823274768680948, + "learning_rate": 5.084564374174649e-06, + "loss": 0.5911, + "step": 9620 + }, + { + "epoch": 1.64, + "grad_norm": 8.325720202355667, + "learning_rate": 5.079619520154865e-06, + "loss": 0.5927, + "step": 9625 + }, + { + "epoch": 1.64, + "grad_norm": 12.803797542503668, + "learning_rate": 5.074674588241157e-06, + "loss": 0.5723, + "step": 9630 + }, + { + "epoch": 1.64, + "grad_norm": 7.970088435158851, + "learning_rate": 5.069729583271285e-06, + "loss": 0.5959, + "step": 9635 + }, + { + "epoch": 1.64, + "grad_norm": 24.213551346003864, + "learning_rate": 5.0647845100830805e-06, + "loss": 0.5713, + "step": 9640 + }, + { + "epoch": 1.64, + "grad_norm": 11.224768652073715, + "learning_rate": 5.059839373514441e-06, + "loss": 0.5854, + "step": 9645 + }, + { + "epoch": 1.64, + "grad_norm": 5.8733589879068955, + "learning_rate": 5.05489417840333e-06, + "loss": 0.5768, + "step": 9650 + }, + { + "epoch": 1.64, + "grad_norm": 8.377198052908513, + "learning_rate": 5.049948929587764e-06, + "loss": 0.5836, + "step": 9655 + }, + { + "epoch": 1.64, + "grad_norm": 8.73180875581141, + "learning_rate": 5.045003631905813e-06, + "loss": 0.5842, + "step": 9660 + }, + { + "epoch": 1.64, + "grad_norm": 25.008213493513512, + "learning_rate": 5.040058290195594e-06, + "loss": 0.5947, + "step": 9665 + }, + { + "epoch": 1.64, + "grad_norm": 6.41992480953314, + "learning_rate": 5.0351129092952685e-06, + "loss": 0.5729, + "step": 9670 + }, + { + "epoch": 1.64, + "grad_norm": 10.591693362525524, + "learning_rate": 5.030167494043039e-06, + "loss": 0.5941, + "step": 9675 + }, + { + "epoch": 1.65, + "grad_norm": 13.601293521527685, + "learning_rate": 5.025222049277136e-06, + "loss": 0.5862, + "step": 9680 + }, + { + "epoch": 1.65, + "grad_norm": 7.893404495312224, + "learning_rate": 5.020276579835821e-06, + "loss": 0.5965, + "step": 9685 + }, + { + "epoch": 1.65, + "grad_norm": 13.84532963343138, + "learning_rate": 5.0153310905573815e-06, + "loss": 0.5938, + "step": 9690 + }, + { + "epoch": 1.65, + "grad_norm": 7.06878354567659, + "learning_rate": 5.0103855862801235e-06, + "loss": 0.5921, + "step": 9695 + }, + { + "epoch": 1.65, + "grad_norm": 31.63918706961253, + "learning_rate": 5.005440071842365e-06, + "loss": 0.5951, + "step": 9700 + }, + { + "epoch": 1.65, + "grad_norm": 12.578560024309926, + "learning_rate": 5.000494552082437e-06, + "loss": 0.5893, + "step": 9705 + }, + { + "epoch": 1.65, + "grad_norm": 24.101825694531986, + "learning_rate": 4.995549031838675e-06, + "loss": 0.583, + "step": 9710 + }, + { + "epoch": 1.65, + "grad_norm": 29.86430102484688, + "learning_rate": 4.990603515949416e-06, + "loss": 0.5828, + "step": 9715 + }, + { + "epoch": 1.65, + "grad_norm": 10.822902890432811, + "learning_rate": 4.985658009252992e-06, + "loss": 0.5617, + "step": 9720 + }, + { + "epoch": 1.65, + "grad_norm": 26.90223147024694, + "learning_rate": 4.980712516587724e-06, + "loss": 0.5908, + "step": 9725 + }, + { + "epoch": 1.65, + "grad_norm": 34.950449323343015, + "learning_rate": 4.975767042791921e-06, + "loss": 0.5766, + "step": 9730 + }, + { + "epoch": 1.66, + "grad_norm": 12.257839463549047, + "learning_rate": 4.970821592703874e-06, + "loss": 0.5859, + "step": 9735 + }, + { + "epoch": 1.66, + "grad_norm": 8.386917175444268, + "learning_rate": 4.965876171161848e-06, + "loss": 0.5797, + "step": 9740 + }, + { + "epoch": 1.66, + "grad_norm": 22.194715608074517, + "learning_rate": 4.960930783004085e-06, + "loss": 0.5698, + "step": 9745 + }, + { + "epoch": 1.66, + "grad_norm": 21.670717750103282, + "learning_rate": 4.955985433068791e-06, + "loss": 0.5858, + "step": 9750 + }, + { + "epoch": 1.66, + "grad_norm": 24.122686284627108, + "learning_rate": 4.951040126194135e-06, + "loss": 0.5921, + "step": 9755 + }, + { + "epoch": 1.66, + "grad_norm": 16.764344436833607, + "learning_rate": 4.946094867218243e-06, + "loss": 0.5677, + "step": 9760 + }, + { + "epoch": 1.66, + "grad_norm": 19.31689565138828, + "learning_rate": 4.941149660979201e-06, + "loss": 0.5904, + "step": 9765 + }, + { + "epoch": 1.66, + "grad_norm": 11.942338302320184, + "learning_rate": 4.936204512315029e-06, + "loss": 0.5971, + "step": 9770 + }, + { + "epoch": 1.66, + "grad_norm": 11.9944235406172, + "learning_rate": 4.931259426063704e-06, + "loss": 0.5843, + "step": 9775 + }, + { + "epoch": 1.66, + "grad_norm": 21.039797821647916, + "learning_rate": 4.926314407063136e-06, + "loss": 0.577, + "step": 9780 + }, + { + "epoch": 1.66, + "grad_norm": 12.865950004267344, + "learning_rate": 4.9213694601511714e-06, + "loss": 0.5809, + "step": 9785 + }, + { + "epoch": 1.66, + "grad_norm": 7.363302596636814, + "learning_rate": 4.9164245901655845e-06, + "loss": 0.5825, + "step": 9790 + }, + { + "epoch": 1.67, + "grad_norm": 7.5785200823388505, + "learning_rate": 4.911479801944076e-06, + "loss": 0.5878, + "step": 9795 + }, + { + "epoch": 1.67, + "grad_norm": 29.945610829274326, + "learning_rate": 4.906535100324264e-06, + "loss": 0.5877, + "step": 9800 + }, + { + "epoch": 1.67, + "grad_norm": 7.813855108965385, + "learning_rate": 4.901590490143686e-06, + "loss": 0.5727, + "step": 9805 + }, + { + "epoch": 1.67, + "grad_norm": 6.498703494352979, + "learning_rate": 4.896645976239785e-06, + "loss": 0.5813, + "step": 9810 + }, + { + "epoch": 1.67, + "grad_norm": 16.94551286609573, + "learning_rate": 4.8917015634499125e-06, + "loss": 0.5937, + "step": 9815 + }, + { + "epoch": 1.67, + "grad_norm": 6.006180458225585, + "learning_rate": 4.886757256611323e-06, + "loss": 0.5783, + "step": 9820 + }, + { + "epoch": 1.67, + "grad_norm": 10.992668492922249, + "learning_rate": 4.881813060561162e-06, + "loss": 0.5793, + "step": 9825 + }, + { + "epoch": 1.67, + "grad_norm": 7.09319164500253, + "learning_rate": 4.876868980136472e-06, + "loss": 0.5774, + "step": 9830 + }, + { + "epoch": 1.67, + "grad_norm": 19.408392042835462, + "learning_rate": 4.87192502017418e-06, + "loss": 0.5756, + "step": 9835 + }, + { + "epoch": 1.67, + "grad_norm": 17.818354113938415, + "learning_rate": 4.866981185511095e-06, + "loss": 0.5755, + "step": 9840 + }, + { + "epoch": 1.67, + "grad_norm": 8.504576142845876, + "learning_rate": 4.862037480983906e-06, + "loss": 0.5762, + "step": 9845 + }, + { + "epoch": 1.67, + "grad_norm": 21.296112438487995, + "learning_rate": 4.857093911429169e-06, + "loss": 0.5928, + "step": 9850 + }, + { + "epoch": 1.68, + "grad_norm": 17.99023787677679, + "learning_rate": 4.852150481683313e-06, + "loss": 0.5809, + "step": 9855 + }, + { + "epoch": 1.68, + "grad_norm": 6.901437789610188, + "learning_rate": 4.847207196582628e-06, + "loss": 0.5883, + "step": 9860 + }, + { + "epoch": 1.68, + "grad_norm": 14.855858691741162, + "learning_rate": 4.842264060963265e-06, + "loss": 0.5962, + "step": 9865 + }, + { + "epoch": 1.68, + "grad_norm": 27.968729321502533, + "learning_rate": 4.837321079661225e-06, + "loss": 0.5897, + "step": 9870 + }, + { + "epoch": 1.68, + "grad_norm": 10.75320400924953, + "learning_rate": 4.83237825751236e-06, + "loss": 0.5836, + "step": 9875 + }, + { + "epoch": 1.68, + "grad_norm": 9.5378797267475, + "learning_rate": 4.827435599352367e-06, + "loss": 0.5679, + "step": 9880 + }, + { + "epoch": 1.68, + "grad_norm": 26.342093199593943, + "learning_rate": 4.822493110016785e-06, + "loss": 0.5743, + "step": 9885 + }, + { + "epoch": 1.68, + "grad_norm": 5.642984156198046, + "learning_rate": 4.817550794340977e-06, + "loss": 0.5702, + "step": 9890 + }, + { + "epoch": 1.68, + "grad_norm": 13.668599007883666, + "learning_rate": 4.812608657160149e-06, + "loss": 0.5739, + "step": 9895 + }, + { + "epoch": 1.68, + "grad_norm": 10.331624056006664, + "learning_rate": 4.807666703309327e-06, + "loss": 0.5671, + "step": 9900 + }, + { + "epoch": 1.68, + "grad_norm": 25.002481425875065, + "learning_rate": 4.802724937623355e-06, + "loss": 0.5769, + "step": 9905 + }, + { + "epoch": 1.68, + "grad_norm": 18.291991609730832, + "learning_rate": 4.7977833649369e-06, + "loss": 0.5797, + "step": 9910 + }, + { + "epoch": 1.69, + "grad_norm": 9.706725759701401, + "learning_rate": 4.7928419900844316e-06, + "loss": 0.5755, + "step": 9915 + }, + { + "epoch": 1.69, + "grad_norm": 14.17580923262539, + "learning_rate": 4.787900817900232e-06, + "loss": 0.5749, + "step": 9920 + }, + { + "epoch": 1.69, + "grad_norm": 7.019499640302362, + "learning_rate": 4.782959853218386e-06, + "loss": 0.5819, + "step": 9925 + }, + { + "epoch": 1.69, + "grad_norm": 13.759696126688855, + "learning_rate": 4.778019100872767e-06, + "loss": 0.5792, + "step": 9930 + }, + { + "epoch": 1.69, + "grad_norm": 26.831148200227613, + "learning_rate": 4.773078565697048e-06, + "loss": 0.5565, + "step": 9935 + }, + { + "epoch": 1.69, + "grad_norm": 6.270356675527533, + "learning_rate": 4.76813825252469e-06, + "loss": 0.5843, + "step": 9940 + }, + { + "epoch": 1.69, + "grad_norm": 12.231540027130091, + "learning_rate": 4.763198166188933e-06, + "loss": 0.5774, + "step": 9945 + }, + { + "epoch": 1.69, + "grad_norm": 7.9990434864943545, + "learning_rate": 4.758258311522798e-06, + "loss": 0.5961, + "step": 9950 + }, + { + "epoch": 1.69, + "grad_norm": 5.583122420995502, + "learning_rate": 4.7533186933590766e-06, + "loss": 0.5779, + "step": 9955 + }, + { + "epoch": 1.69, + "grad_norm": 12.375315990982129, + "learning_rate": 4.748379316530331e-06, + "loss": 0.5557, + "step": 9960 + }, + { + "epoch": 1.69, + "grad_norm": 33.351291331106125, + "learning_rate": 4.743440185868888e-06, + "loss": 0.5775, + "step": 9965 + }, + { + "epoch": 1.7, + "grad_norm": 6.461224217936112, + "learning_rate": 4.738501306206831e-06, + "loss": 0.5718, + "step": 9970 + }, + { + "epoch": 1.7, + "grad_norm": 20.359831655936695, + "learning_rate": 4.733562682375999e-06, + "loss": 0.5673, + "step": 9975 + }, + { + "epoch": 1.7, + "grad_norm": 12.654800710201974, + "learning_rate": 4.728624319207979e-06, + "loss": 0.5715, + "step": 9980 + }, + { + "epoch": 1.7, + "grad_norm": 11.9141816673019, + "learning_rate": 4.723686221534109e-06, + "loss": 0.5612, + "step": 9985 + }, + { + "epoch": 1.7, + "grad_norm": 21.915149604282366, + "learning_rate": 4.7187483941854615e-06, + "loss": 0.5743, + "step": 9990 + }, + { + "epoch": 1.7, + "grad_norm": 12.012368837366665, + "learning_rate": 4.713810841992845e-06, + "loss": 0.5684, + "step": 9995 + }, + { + "epoch": 1.7, + "grad_norm": 23.658206543376345, + "learning_rate": 4.708873569786803e-06, + "loss": 0.588, + "step": 10000 + }, + { + "epoch": 1.7, + "grad_norm": 12.610097088990205, + "learning_rate": 4.7039365823976e-06, + "loss": 0.557, + "step": 10005 + }, + { + "epoch": 1.7, + "grad_norm": 10.946014682934994, + "learning_rate": 4.6989998846552234e-06, + "loss": 0.5691, + "step": 10010 + }, + { + "epoch": 1.7, + "grad_norm": 5.491836666841432, + "learning_rate": 4.694063481389377e-06, + "loss": 0.5755, + "step": 10015 + }, + { + "epoch": 1.7, + "grad_norm": 12.281593897023107, + "learning_rate": 4.68912737742948e-06, + "loss": 0.5824, + "step": 10020 + }, + { + "epoch": 1.7, + "grad_norm": 6.680821452661381, + "learning_rate": 4.684191577604653e-06, + "loss": 0.5528, + "step": 10025 + }, + { + "epoch": 1.71, + "grad_norm": 8.924164351060437, + "learning_rate": 4.679256086743725e-06, + "loss": 0.5757, + "step": 10030 + }, + { + "epoch": 1.71, + "grad_norm": 6.566207423001996, + "learning_rate": 4.674320909675218e-06, + "loss": 0.5593, + "step": 10035 + }, + { + "epoch": 1.71, + "grad_norm": 9.428569933754627, + "learning_rate": 4.66938605122735e-06, + "loss": 0.5751, + "step": 10040 + }, + { + "epoch": 1.71, + "grad_norm": 16.645878410906143, + "learning_rate": 4.664451516228027e-06, + "loss": 0.5799, + "step": 10045 + }, + { + "epoch": 1.71, + "grad_norm": 14.705526960881357, + "learning_rate": 4.659517309504834e-06, + "loss": 0.5736, + "step": 10050 + }, + { + "epoch": 1.71, + "grad_norm": 17.44097547370709, + "learning_rate": 4.6545834358850415e-06, + "loss": 0.5816, + "step": 10055 + }, + { + "epoch": 1.71, + "grad_norm": 6.76221489972865, + "learning_rate": 4.649649900195591e-06, + "loss": 0.5735, + "step": 10060 + }, + { + "epoch": 1.71, + "grad_norm": 12.732878022187395, + "learning_rate": 4.644716707263091e-06, + "loss": 0.579, + "step": 10065 + }, + { + "epoch": 1.71, + "grad_norm": 5.972228567376378, + "learning_rate": 4.6397838619138205e-06, + "loss": 0.574, + "step": 10070 + }, + { + "epoch": 1.71, + "grad_norm": 11.090807111400876, + "learning_rate": 4.634851368973713e-06, + "loss": 0.5678, + "step": 10075 + }, + { + "epoch": 1.71, + "grad_norm": 17.469211038126204, + "learning_rate": 4.6299192332683605e-06, + "loss": 0.5764, + "step": 10080 + }, + { + "epoch": 1.71, + "grad_norm": 9.812271489373423, + "learning_rate": 4.6249874596230056e-06, + "loss": 0.5643, + "step": 10085 + }, + { + "epoch": 1.72, + "grad_norm": 14.540827875393065, + "learning_rate": 4.620056052862532e-06, + "loss": 0.5597, + "step": 10090 + }, + { + "epoch": 1.72, + "grad_norm": 9.747842561670529, + "learning_rate": 4.615125017811471e-06, + "loss": 0.5727, + "step": 10095 + }, + { + "epoch": 1.72, + "grad_norm": 6.5164902547307815, + "learning_rate": 4.6101943592939855e-06, + "loss": 0.5644, + "step": 10100 + }, + { + "epoch": 1.72, + "grad_norm": 27.124478750771498, + "learning_rate": 4.605264082133872e-06, + "loss": 0.5743, + "step": 10105 + }, + { + "epoch": 1.72, + "grad_norm": 8.513661237874295, + "learning_rate": 4.600334191154554e-06, + "loss": 0.555, + "step": 10110 + }, + { + "epoch": 1.72, + "grad_norm": 15.864940327869741, + "learning_rate": 4.595404691179077e-06, + "loss": 0.5547, + "step": 10115 + }, + { + "epoch": 1.72, + "grad_norm": 7.06766107265483, + "learning_rate": 4.5904755870301035e-06, + "loss": 0.5793, + "step": 10120 + }, + { + "epoch": 1.72, + "grad_norm": 8.085298717926536, + "learning_rate": 4.585546883529911e-06, + "loss": 0.578, + "step": 10125 + }, + { + "epoch": 1.72, + "grad_norm": 7.041429483784615, + "learning_rate": 4.5806185855003786e-06, + "loss": 0.5677, + "step": 10130 + }, + { + "epoch": 1.72, + "grad_norm": 5.799211328435109, + "learning_rate": 4.575690697762996e-06, + "loss": 0.5665, + "step": 10135 + }, + { + "epoch": 1.72, + "grad_norm": 17.448452244387205, + "learning_rate": 4.5707632251388484e-06, + "loss": 0.5753, + "step": 10140 + }, + { + "epoch": 1.72, + "grad_norm": 26.61989099099489, + "learning_rate": 4.5658361724486165e-06, + "loss": 0.5736, + "step": 10145 + }, + { + "epoch": 1.73, + "grad_norm": 7.459385864952437, + "learning_rate": 4.5609095445125665e-06, + "loss": 0.5675, + "step": 10150 + }, + { + "epoch": 1.73, + "grad_norm": 11.14015548413005, + "learning_rate": 4.555983346150551e-06, + "loss": 0.552, + "step": 10155 + }, + { + "epoch": 1.73, + "grad_norm": 17.124694141352393, + "learning_rate": 4.551057582182005e-06, + "loss": 0.5673, + "step": 10160 + }, + { + "epoch": 1.73, + "grad_norm": 22.63468479100616, + "learning_rate": 4.546132257425939e-06, + "loss": 0.5709, + "step": 10165 + }, + { + "epoch": 1.73, + "grad_norm": 17.668158526720532, + "learning_rate": 4.541207376700924e-06, + "loss": 0.5714, + "step": 10170 + }, + { + "epoch": 1.73, + "grad_norm": 9.798478426561271, + "learning_rate": 4.5362829448251076e-06, + "loss": 0.5721, + "step": 10175 + }, + { + "epoch": 1.73, + "grad_norm": 5.943369172881516, + "learning_rate": 4.5313589666161935e-06, + "loss": 0.5763, + "step": 10180 + }, + { + "epoch": 1.73, + "grad_norm": 5.960859334311488, + "learning_rate": 4.5264354468914425e-06, + "loss": 0.5775, + "step": 10185 + }, + { + "epoch": 1.73, + "grad_norm": 15.36159808951015, + "learning_rate": 4.521512390467668e-06, + "loss": 0.5545, + "step": 10190 + }, + { + "epoch": 1.73, + "grad_norm": 5.850169100527044, + "learning_rate": 4.516589802161228e-06, + "loss": 0.5657, + "step": 10195 + }, + { + "epoch": 1.73, + "grad_norm": 6.8472842610984985, + "learning_rate": 4.511667686788022e-06, + "loss": 0.5613, + "step": 10200 + }, + { + "epoch": 1.73, + "grad_norm": 6.381231330134624, + "learning_rate": 4.50674604916349e-06, + "loss": 0.5597, + "step": 10205 + }, + { + "epoch": 1.74, + "grad_norm": 5.380650887041463, + "learning_rate": 4.501824894102604e-06, + "loss": 0.5746, + "step": 10210 + }, + { + "epoch": 1.74, + "grad_norm": 6.9616226566722625, + "learning_rate": 4.49690422641986e-06, + "loss": 0.568, + "step": 10215 + }, + { + "epoch": 1.74, + "grad_norm": 7.128136408250823, + "learning_rate": 4.49198405092928e-06, + "loss": 0.5635, + "step": 10220 + }, + { + "epoch": 1.74, + "grad_norm": 7.7778297659247375, + "learning_rate": 4.487064372444406e-06, + "loss": 0.5661, + "step": 10225 + }, + { + "epoch": 1.74, + "grad_norm": 9.148750564471536, + "learning_rate": 4.4821451957782915e-06, + "loss": 0.559, + "step": 10230 + }, + { + "epoch": 1.74, + "grad_norm": 5.749591740727813, + "learning_rate": 4.4772265257435e-06, + "loss": 0.5698, + "step": 10235 + }, + { + "epoch": 1.74, + "grad_norm": 10.949616805563828, + "learning_rate": 4.472308367152098e-06, + "loss": 0.5518, + "step": 10240 + }, + { + "epoch": 1.74, + "grad_norm": 10.755053911637036, + "learning_rate": 4.467390724815654e-06, + "loss": 0.572, + "step": 10245 + }, + { + "epoch": 1.74, + "grad_norm": 7.60832823711757, + "learning_rate": 4.462473603545232e-06, + "loss": 0.5794, + "step": 10250 + }, + { + "epoch": 1.74, + "grad_norm": 14.883077773789456, + "learning_rate": 4.457557008151379e-06, + "loss": 0.5713, + "step": 10255 + }, + { + "epoch": 1.74, + "grad_norm": 8.519039378885232, + "learning_rate": 4.452640943444137e-06, + "loss": 0.5762, + "step": 10260 + }, + { + "epoch": 1.75, + "grad_norm": 6.12176426140937, + "learning_rate": 4.447725414233024e-06, + "loss": 0.5575, + "step": 10265 + }, + { + "epoch": 1.75, + "grad_norm": 9.597445584862303, + "learning_rate": 4.442810425327033e-06, + "loss": 0.5682, + "step": 10270 + }, + { + "epoch": 1.75, + "grad_norm": 15.899622508028797, + "learning_rate": 4.437895981534632e-06, + "loss": 0.5663, + "step": 10275 + }, + { + "epoch": 1.75, + "grad_norm": 15.23241821477259, + "learning_rate": 4.432982087663755e-06, + "loss": 0.5706, + "step": 10280 + }, + { + "epoch": 1.75, + "grad_norm": 10.064482315135152, + "learning_rate": 4.428068748521794e-06, + "loss": 0.5626, + "step": 10285 + }, + { + "epoch": 1.75, + "grad_norm": 7.224225338476222, + "learning_rate": 4.423155968915605e-06, + "loss": 0.5682, + "step": 10290 + }, + { + "epoch": 1.75, + "grad_norm": 9.104061848723966, + "learning_rate": 4.418243753651488e-06, + "loss": 0.5513, + "step": 10295 + }, + { + "epoch": 1.75, + "grad_norm": 7.6983397688120485, + "learning_rate": 4.413332107535199e-06, + "loss": 0.575, + "step": 10300 + }, + { + "epoch": 1.75, + "grad_norm": 9.031076235814748, + "learning_rate": 4.408421035371932e-06, + "loss": 0.5583, + "step": 10305 + }, + { + "epoch": 1.75, + "grad_norm": 23.865179039994, + "learning_rate": 4.4035105419663234e-06, + "loss": 0.5685, + "step": 10310 + }, + { + "epoch": 1.75, + "grad_norm": 9.3402880658064, + "learning_rate": 4.39860063212244e-06, + "loss": 0.5672, + "step": 10315 + }, + { + "epoch": 1.75, + "grad_norm": 8.376246146568219, + "learning_rate": 4.393691310643779e-06, + "loss": 0.5549, + "step": 10320 + }, + { + "epoch": 1.76, + "grad_norm": 7.5076088706963535, + "learning_rate": 4.388782582333263e-06, + "loss": 0.5456, + "step": 10325 + }, + { + "epoch": 1.76, + "grad_norm": 9.076302725867919, + "learning_rate": 4.3838744519932345e-06, + "loss": 0.5735, + "step": 10330 + }, + { + "epoch": 1.76, + "grad_norm": 11.155201996360862, + "learning_rate": 4.378966924425447e-06, + "loss": 0.5672, + "step": 10335 + }, + { + "epoch": 1.76, + "grad_norm": 18.51270787542137, + "learning_rate": 4.3740600044310664e-06, + "loss": 0.569, + "step": 10340 + }, + { + "epoch": 1.76, + "grad_norm": 14.943870912242375, + "learning_rate": 4.3691536968106675e-06, + "loss": 0.5794, + "step": 10345 + }, + { + "epoch": 1.76, + "grad_norm": 16.036753964316397, + "learning_rate": 4.364248006364222e-06, + "loss": 0.5575, + "step": 10350 + }, + { + "epoch": 1.76, + "grad_norm": 14.337387313826945, + "learning_rate": 4.359342937891099e-06, + "loss": 0.56, + "step": 10355 + }, + { + "epoch": 1.76, + "grad_norm": 12.632921011142148, + "learning_rate": 4.354438496190061e-06, + "loss": 0.5733, + "step": 10360 + }, + { + "epoch": 1.76, + "grad_norm": 7.001797244074437, + "learning_rate": 4.349534686059255e-06, + "loss": 0.5841, + "step": 10365 + }, + { + "epoch": 1.76, + "grad_norm": 9.677734308629427, + "learning_rate": 4.344631512296211e-06, + "loss": 0.5562, + "step": 10370 + }, + { + "epoch": 1.76, + "grad_norm": 6.974231493449727, + "learning_rate": 4.3397289796978335e-06, + "loss": 0.5582, + "step": 10375 + }, + { + "epoch": 1.76, + "grad_norm": 8.82580270027655, + "learning_rate": 4.334827093060406e-06, + "loss": 0.5525, + "step": 10380 + }, + { + "epoch": 1.77, + "grad_norm": 13.300464842483924, + "learning_rate": 4.329925857179573e-06, + "loss": 0.5677, + "step": 10385 + }, + { + "epoch": 1.77, + "grad_norm": 9.408626674668492, + "learning_rate": 4.325025276850347e-06, + "loss": 0.556, + "step": 10390 + }, + { + "epoch": 1.77, + "grad_norm": 6.281700862386464, + "learning_rate": 4.3201253568671e-06, + "loss": 0.5579, + "step": 10395 + }, + { + "epoch": 1.77, + "grad_norm": 6.4899340428012025, + "learning_rate": 4.3152261020235516e-06, + "loss": 0.5609, + "step": 10400 + }, + { + "epoch": 1.77, + "grad_norm": 15.976312177486976, + "learning_rate": 4.31032751711278e-06, + "loss": 0.5598, + "step": 10405 + }, + { + "epoch": 1.77, + "grad_norm": 17.343807709088885, + "learning_rate": 4.305429606927202e-06, + "loss": 0.5547, + "step": 10410 + }, + { + "epoch": 1.77, + "grad_norm": 11.69312788328246, + "learning_rate": 4.300532376258571e-06, + "loss": 0.5574, + "step": 10415 + }, + { + "epoch": 1.77, + "grad_norm": 6.87987772594223, + "learning_rate": 4.295635829897983e-06, + "loss": 0.5494, + "step": 10420 + }, + { + "epoch": 1.77, + "grad_norm": 13.887701568026129, + "learning_rate": 4.2907399726358626e-06, + "loss": 0.5559, + "step": 10425 + }, + { + "epoch": 1.77, + "grad_norm": 24.364887801382608, + "learning_rate": 4.285844809261955e-06, + "loss": 0.556, + "step": 10430 + }, + { + "epoch": 1.77, + "grad_norm": 10.924058864204879, + "learning_rate": 4.280950344565335e-06, + "loss": 0.5576, + "step": 10435 + }, + { + "epoch": 1.77, + "grad_norm": 6.91510691811666, + "learning_rate": 4.276056583334386e-06, + "loss": 0.5637, + "step": 10440 + }, + { + "epoch": 1.78, + "grad_norm": 5.736864177986743, + "learning_rate": 4.271163530356808e-06, + "loss": 0.5521, + "step": 10445 + }, + { + "epoch": 1.78, + "grad_norm": 28.15747552724219, + "learning_rate": 4.266271190419609e-06, + "loss": 0.562, + "step": 10450 + }, + { + "epoch": 1.78, + "grad_norm": 17.159897130514594, + "learning_rate": 4.261379568309093e-06, + "loss": 0.574, + "step": 10455 + }, + { + "epoch": 1.78, + "grad_norm": 5.932452219225763, + "learning_rate": 4.256488668810868e-06, + "loss": 0.5581, + "step": 10460 + }, + { + "epoch": 1.78, + "grad_norm": 13.313755617902103, + "learning_rate": 4.251598496709832e-06, + "loss": 0.5526, + "step": 10465 + }, + { + "epoch": 1.78, + "grad_norm": 18.111505571206518, + "learning_rate": 4.2467090567901735e-06, + "loss": 0.5582, + "step": 10470 + }, + { + "epoch": 1.78, + "grad_norm": 17.77722314473156, + "learning_rate": 4.241820353835363e-06, + "loss": 0.556, + "step": 10475 + }, + { + "epoch": 1.78, + "grad_norm": 14.39601648137402, + "learning_rate": 4.236932392628149e-06, + "loss": 0.5528, + "step": 10480 + }, + { + "epoch": 1.78, + "grad_norm": 24.56336728851155, + "learning_rate": 4.2320451779505575e-06, + "loss": 0.5469, + "step": 10485 + }, + { + "epoch": 1.78, + "grad_norm": 9.487147299657114, + "learning_rate": 4.227158714583884e-06, + "loss": 0.5588, + "step": 10490 + }, + { + "epoch": 1.78, + "grad_norm": 16.364733268739467, + "learning_rate": 4.222273007308684e-06, + "loss": 0.5639, + "step": 10495 + }, + { + "epoch": 1.79, + "grad_norm": 6.441684490504245, + "learning_rate": 4.217388060904778e-06, + "loss": 0.5534, + "step": 10500 + }, + { + "epoch": 1.79, + "grad_norm": 15.84464990205949, + "learning_rate": 4.21250388015124e-06, + "loss": 0.5473, + "step": 10505 + }, + { + "epoch": 1.79, + "grad_norm": 11.347143338237716, + "learning_rate": 4.207620469826397e-06, + "loss": 0.5563, + "step": 10510 + }, + { + "epoch": 1.79, + "grad_norm": 20.032525340936964, + "learning_rate": 4.2027378347078225e-06, + "loss": 0.541, + "step": 10515 + }, + { + "epoch": 1.79, + "grad_norm": 22.353036370097144, + "learning_rate": 4.197855979572326e-06, + "loss": 0.5461, + "step": 10520 + }, + { + "epoch": 1.79, + "grad_norm": 6.145827342584123, + "learning_rate": 4.192974909195962e-06, + "loss": 0.5538, + "step": 10525 + }, + { + "epoch": 1.79, + "grad_norm": 16.284130529239555, + "learning_rate": 4.188094628354013e-06, + "loss": 0.5523, + "step": 10530 + }, + { + "epoch": 1.79, + "grad_norm": 21.944753217572533, + "learning_rate": 4.1832151418209865e-06, + "loss": 0.5497, + "step": 10535 + }, + { + "epoch": 1.79, + "grad_norm": 10.726946766485042, + "learning_rate": 4.1783364543706165e-06, + "loss": 0.5385, + "step": 10540 + }, + { + "epoch": 1.79, + "grad_norm": 8.947396885772298, + "learning_rate": 4.173458570775856e-06, + "loss": 0.5655, + "step": 10545 + }, + { + "epoch": 1.79, + "grad_norm": 10.966857276634125, + "learning_rate": 4.1685814958088696e-06, + "loss": 0.5756, + "step": 10550 + }, + { + "epoch": 1.79, + "grad_norm": 7.402810816083218, + "learning_rate": 4.1637052342410315e-06, + "loss": 0.5623, + "step": 10555 + }, + { + "epoch": 1.8, + "grad_norm": 5.900704617475461, + "learning_rate": 4.1588297908429195e-06, + "loss": 0.54, + "step": 10560 + }, + { + "epoch": 1.8, + "grad_norm": 8.269730671746444, + "learning_rate": 4.153955170384312e-06, + "loss": 0.5483, + "step": 10565 + }, + { + "epoch": 1.8, + "grad_norm": 18.570352607344855, + "learning_rate": 4.149081377634182e-06, + "loss": 0.5409, + "step": 10570 + }, + { + "epoch": 1.8, + "grad_norm": 9.479458512559505, + "learning_rate": 4.14420841736069e-06, + "loss": 0.5487, + "step": 10575 + }, + { + "epoch": 1.8, + "grad_norm": 20.410675325402618, + "learning_rate": 4.1393362943311866e-06, + "loss": 0.5704, + "step": 10580 + }, + { + "epoch": 1.8, + "grad_norm": 8.033492396039378, + "learning_rate": 4.1344650133122e-06, + "loss": 0.5527, + "step": 10585 + }, + { + "epoch": 1.8, + "grad_norm": 8.608181918412429, + "learning_rate": 4.129594579069436e-06, + "loss": 0.5503, + "step": 10590 + }, + { + "epoch": 1.8, + "grad_norm": 16.721543496224996, + "learning_rate": 4.1247249963677725e-06, + "loss": 0.5527, + "step": 10595 + }, + { + "epoch": 1.8, + "grad_norm": 8.756527197932947, + "learning_rate": 4.119856269971254e-06, + "loss": 0.5569, + "step": 10600 + }, + { + "epoch": 1.8, + "grad_norm": 13.928787184558088, + "learning_rate": 4.114988404643086e-06, + "loss": 0.551, + "step": 10605 + }, + { + "epoch": 1.8, + "grad_norm": 27.003024941869466, + "learning_rate": 4.110121405145634e-06, + "loss": 0.5454, + "step": 10610 + }, + { + "epoch": 1.8, + "grad_norm": 24.19555970041987, + "learning_rate": 4.105255276240413e-06, + "loss": 0.5502, + "step": 10615 + }, + { + "epoch": 1.81, + "grad_norm": 6.339057342338457, + "learning_rate": 4.100390022688087e-06, + "loss": 0.5623, + "step": 10620 + }, + { + "epoch": 1.81, + "grad_norm": 11.422037220563851, + "learning_rate": 4.095525649248467e-06, + "loss": 0.544, + "step": 10625 + }, + { + "epoch": 1.81, + "grad_norm": 14.684061712602356, + "learning_rate": 4.0906621606805e-06, + "loss": 0.5611, + "step": 10630 + }, + { + "epoch": 1.81, + "grad_norm": 19.907732420246848, + "learning_rate": 4.085799561742269e-06, + "loss": 0.5619, + "step": 10635 + }, + { + "epoch": 1.81, + "grad_norm": 13.149132567592444, + "learning_rate": 4.080937857190984e-06, + "loss": 0.5567, + "step": 10640 + }, + { + "epoch": 1.81, + "grad_norm": 17.153750553470985, + "learning_rate": 4.076077051782983e-06, + "loss": 0.5652, + "step": 10645 + }, + { + "epoch": 1.81, + "grad_norm": 5.938875135207251, + "learning_rate": 4.0712171502737245e-06, + "loss": 0.5475, + "step": 10650 + }, + { + "epoch": 1.81, + "grad_norm": 28.712317930091356, + "learning_rate": 4.0663581574177764e-06, + "loss": 0.5492, + "step": 10655 + }, + { + "epoch": 1.81, + "grad_norm": 6.907991497479526, + "learning_rate": 4.061500077968829e-06, + "loss": 0.5519, + "step": 10660 + }, + { + "epoch": 1.81, + "grad_norm": 11.26578908835014, + "learning_rate": 4.056642916679666e-06, + "loss": 0.5678, + "step": 10665 + }, + { + "epoch": 1.81, + "grad_norm": 6.9969212767908955, + "learning_rate": 4.051786678302182e-06, + "loss": 0.5542, + "step": 10670 + }, + { + "epoch": 1.81, + "grad_norm": 9.811354898565146, + "learning_rate": 4.046931367587367e-06, + "loss": 0.5547, + "step": 10675 + }, + { + "epoch": 1.82, + "grad_norm": 5.351676942851948, + "learning_rate": 4.042076989285301e-06, + "loss": 0.5487, + "step": 10680 + }, + { + "epoch": 1.82, + "grad_norm": 12.690168235430562, + "learning_rate": 4.037223548145155e-06, + "loss": 0.5376, + "step": 10685 + }, + { + "epoch": 1.82, + "grad_norm": 13.10316249163135, + "learning_rate": 4.0323710489151816e-06, + "loss": 0.5542, + "step": 10690 + }, + { + "epoch": 1.82, + "grad_norm": 17.238263388190468, + "learning_rate": 4.027519496342707e-06, + "loss": 0.5477, + "step": 10695 + }, + { + "epoch": 1.82, + "grad_norm": 7.111697756767847, + "learning_rate": 4.0226688951741415e-06, + "loss": 0.541, + "step": 10700 + }, + { + "epoch": 1.82, + "grad_norm": 9.312504651719623, + "learning_rate": 4.017819250154957e-06, + "loss": 0.5625, + "step": 10705 + }, + { + "epoch": 1.82, + "grad_norm": 13.855123982730133, + "learning_rate": 4.01297056602969e-06, + "loss": 0.5456, + "step": 10710 + }, + { + "epoch": 1.82, + "grad_norm": 11.61601724706946, + "learning_rate": 4.008122847541942e-06, + "loss": 0.5484, + "step": 10715 + }, + { + "epoch": 1.82, + "grad_norm": 14.541933682803487, + "learning_rate": 4.003276099434365e-06, + "loss": 0.5408, + "step": 10720 + }, + { + "epoch": 1.82, + "grad_norm": 16.02298738803633, + "learning_rate": 3.998430326448664e-06, + "loss": 0.5623, + "step": 10725 + }, + { + "epoch": 1.82, + "grad_norm": 17.4550356670551, + "learning_rate": 3.993585533325591e-06, + "loss": 0.5351, + "step": 10730 + }, + { + "epoch": 1.83, + "grad_norm": 6.076748410436516, + "learning_rate": 3.988741724804935e-06, + "loss": 0.5471, + "step": 10735 + }, + { + "epoch": 1.83, + "grad_norm": 7.060203765038019, + "learning_rate": 3.983898905625525e-06, + "loss": 0.5606, + "step": 10740 + }, + { + "epoch": 1.83, + "grad_norm": 6.697863097937426, + "learning_rate": 3.979057080525223e-06, + "loss": 0.5733, + "step": 10745 + }, + { + "epoch": 1.83, + "grad_norm": 11.600904312186005, + "learning_rate": 3.974216254240917e-06, + "loss": 0.5571, + "step": 10750 + }, + { + "epoch": 1.83, + "grad_norm": 8.41737306070303, + "learning_rate": 3.969376431508516e-06, + "loss": 0.5434, + "step": 10755 + }, + { + "epoch": 1.83, + "grad_norm": 7.975078682574957, + "learning_rate": 3.964537617062951e-06, + "loss": 0.5439, + "step": 10760 + }, + { + "epoch": 1.83, + "grad_norm": 17.113910948663463, + "learning_rate": 3.959699815638163e-06, + "loss": 0.55, + "step": 10765 + }, + { + "epoch": 1.83, + "grad_norm": 9.726618538250287, + "learning_rate": 3.954863031967108e-06, + "loss": 0.5516, + "step": 10770 + }, + { + "epoch": 1.83, + "grad_norm": 6.149850327053818, + "learning_rate": 3.950027270781736e-06, + "loss": 0.5534, + "step": 10775 + }, + { + "epoch": 1.83, + "grad_norm": 6.751245969017911, + "learning_rate": 3.945192536813006e-06, + "loss": 0.5286, + "step": 10780 + }, + { + "epoch": 1.83, + "grad_norm": 9.60655821899745, + "learning_rate": 3.940358834790867e-06, + "loss": 0.5523, + "step": 10785 + }, + { + "epoch": 1.83, + "grad_norm": 5.613630439695627, + "learning_rate": 3.935526169444261e-06, + "loss": 0.5413, + "step": 10790 + }, + { + "epoch": 1.84, + "grad_norm": 7.455914843357684, + "learning_rate": 3.930694545501117e-06, + "loss": 0.5377, + "step": 10795 + }, + { + "epoch": 1.84, + "grad_norm": 8.107483270728077, + "learning_rate": 3.925863967688339e-06, + "loss": 0.5428, + "step": 10800 + }, + { + "epoch": 1.84, + "grad_norm": 24.49841838362915, + "learning_rate": 3.921034440731813e-06, + "loss": 0.5484, + "step": 10805 + }, + { + "epoch": 1.84, + "grad_norm": 9.02938698091131, + "learning_rate": 3.916205969356399e-06, + "loss": 0.5291, + "step": 10810 + }, + { + "epoch": 1.84, + "grad_norm": 11.780566938233218, + "learning_rate": 3.911378558285915e-06, + "loss": 0.5498, + "step": 10815 + }, + { + "epoch": 1.84, + "grad_norm": 10.707041202570831, + "learning_rate": 3.906552212243151e-06, + "loss": 0.558, + "step": 10820 + }, + { + "epoch": 1.84, + "grad_norm": 12.845285542740227, + "learning_rate": 3.90172693594985e-06, + "loss": 0.54, + "step": 10825 + }, + { + "epoch": 1.84, + "grad_norm": 6.30898838434219, + "learning_rate": 3.89690273412671e-06, + "loss": 0.5421, + "step": 10830 + }, + { + "epoch": 1.84, + "grad_norm": 6.456838345718717, + "learning_rate": 3.892079611493379e-06, + "loss": 0.5268, + "step": 10835 + }, + { + "epoch": 1.84, + "grad_norm": 6.483645969550005, + "learning_rate": 3.8872575727684485e-06, + "loss": 0.5467, + "step": 10840 + }, + { + "epoch": 1.84, + "grad_norm": 9.01162771308466, + "learning_rate": 3.882436622669447e-06, + "loss": 0.5395, + "step": 10845 + }, + { + "epoch": 1.84, + "grad_norm": 6.646301598055061, + "learning_rate": 3.877616765912843e-06, + "loss": 0.5301, + "step": 10850 + }, + { + "epoch": 1.85, + "grad_norm": 11.206351159952076, + "learning_rate": 3.872798007214028e-06, + "loss": 0.5457, + "step": 10855 + }, + { + "epoch": 1.85, + "grad_norm": 6.460428079828267, + "learning_rate": 3.867980351287326e-06, + "loss": 0.5459, + "step": 10860 + }, + { + "epoch": 1.85, + "grad_norm": 9.058509320670087, + "learning_rate": 3.863163802845979e-06, + "loss": 0.5513, + "step": 10865 + }, + { + "epoch": 1.85, + "grad_norm": 8.923932413895969, + "learning_rate": 3.858348366602147e-06, + "loss": 0.5312, + "step": 10870 + }, + { + "epoch": 1.85, + "grad_norm": 6.311599242077515, + "learning_rate": 3.853534047266902e-06, + "loss": 0.5456, + "step": 10875 + }, + { + "epoch": 1.85, + "grad_norm": 17.53201788074447, + "learning_rate": 3.848720849550221e-06, + "loss": 0.5415, + "step": 10880 + }, + { + "epoch": 1.85, + "grad_norm": 6.801911731802575, + "learning_rate": 3.843908778160986e-06, + "loss": 0.5391, + "step": 10885 + }, + { + "epoch": 1.85, + "grad_norm": 8.338680249038083, + "learning_rate": 3.839097837806977e-06, + "loss": 0.5482, + "step": 10890 + }, + { + "epoch": 1.85, + "grad_norm": 6.535259979212849, + "learning_rate": 3.834288033194864e-06, + "loss": 0.546, + "step": 10895 + }, + { + "epoch": 1.85, + "grad_norm": 15.935828201938994, + "learning_rate": 3.829479369030211e-06, + "loss": 0.5377, + "step": 10900 + }, + { + "epoch": 1.85, + "grad_norm": 10.688589701889827, + "learning_rate": 3.824671850017462e-06, + "loss": 0.5372, + "step": 10905 + }, + { + "epoch": 1.85, + "grad_norm": 31.066244041899697, + "learning_rate": 3.819865480859943e-06, + "loss": 0.5361, + "step": 10910 + }, + { + "epoch": 1.86, + "grad_norm": 15.014970505246703, + "learning_rate": 3.815060266259856e-06, + "loss": 0.5493, + "step": 10915 + }, + { + "epoch": 1.86, + "grad_norm": 5.526054798739284, + "learning_rate": 3.8102562109182713e-06, + "loss": 0.543, + "step": 10920 + }, + { + "epoch": 1.86, + "grad_norm": 13.449964955547665, + "learning_rate": 3.805453319535126e-06, + "loss": 0.5313, + "step": 10925 + }, + { + "epoch": 1.86, + "grad_norm": 17.86887071360535, + "learning_rate": 3.8006515968092176e-06, + "loss": 0.5498, + "step": 10930 + }, + { + "epoch": 1.86, + "grad_norm": 5.578980091545276, + "learning_rate": 3.7958510474382027e-06, + "loss": 0.5485, + "step": 10935 + }, + { + "epoch": 1.86, + "grad_norm": 6.05700161143195, + "learning_rate": 3.7910516761185864e-06, + "loss": 0.5416, + "step": 10940 + }, + { + "epoch": 1.86, + "grad_norm": 5.476537423347814, + "learning_rate": 3.7862534875457226e-06, + "loss": 0.539, + "step": 10945 + }, + { + "epoch": 1.86, + "grad_norm": 12.67141676921221, + "learning_rate": 3.781456486413809e-06, + "loss": 0.5176, + "step": 10950 + }, + { + "epoch": 1.86, + "grad_norm": 11.665045618301248, + "learning_rate": 3.7766606774158828e-06, + "loss": 0.5338, + "step": 10955 + }, + { + "epoch": 1.86, + "grad_norm": 8.702942638630219, + "learning_rate": 3.7718660652438115e-06, + "loss": 0.5382, + "step": 10960 + }, + { + "epoch": 1.86, + "grad_norm": 6.780251465298803, + "learning_rate": 3.7670726545882945e-06, + "loss": 0.5342, + "step": 10965 + }, + { + "epoch": 1.87, + "grad_norm": 5.284822222671517, + "learning_rate": 3.7622804501388554e-06, + "loss": 0.5461, + "step": 10970 + }, + { + "epoch": 1.87, + "grad_norm": 7.327317512329649, + "learning_rate": 3.7574894565838364e-06, + "loss": 0.5494, + "step": 10975 + }, + { + "epoch": 1.87, + "grad_norm": 8.938292463573944, + "learning_rate": 3.752699678610395e-06, + "loss": 0.5337, + "step": 10980 + }, + { + "epoch": 1.87, + "grad_norm": 6.505501291362319, + "learning_rate": 3.747911120904501e-06, + "loss": 0.5291, + "step": 10985 + }, + { + "epoch": 1.87, + "grad_norm": 6.4303775627203015, + "learning_rate": 3.7431237881509287e-06, + "loss": 0.5246, + "step": 10990 + }, + { + "epoch": 1.87, + "grad_norm": 11.991768485557685, + "learning_rate": 3.7383376850332546e-06, + "loss": 0.5243, + "step": 10995 + }, + { + "epoch": 1.87, + "grad_norm": 5.495221718797736, + "learning_rate": 3.733552816233854e-06, + "loss": 0.5337, + "step": 11000 + }, + { + "epoch": 1.87, + "grad_norm": 5.162826956882016, + "learning_rate": 3.7287691864338926e-06, + "loss": 0.5384, + "step": 11005 + }, + { + "epoch": 1.87, + "grad_norm": 5.199569881998118, + "learning_rate": 3.723986800313324e-06, + "loss": 0.5195, + "step": 11010 + }, + { + "epoch": 1.87, + "grad_norm": 9.756550802893384, + "learning_rate": 3.7192056625508877e-06, + "loss": 0.539, + "step": 11015 + }, + { + "epoch": 1.87, + "grad_norm": 6.743213887283496, + "learning_rate": 3.7144257778240955e-06, + "loss": 0.5365, + "step": 11020 + }, + { + "epoch": 1.87, + "grad_norm": 11.84475921976795, + "learning_rate": 3.70964715080924e-06, + "loss": 0.5423, + "step": 11025 + }, + { + "epoch": 1.88, + "grad_norm": 8.338435642943008, + "learning_rate": 3.704869786181382e-06, + "loss": 0.5352, + "step": 11030 + }, + { + "epoch": 1.88, + "grad_norm": 5.387321434056955, + "learning_rate": 3.700093688614344e-06, + "loss": 0.5361, + "step": 11035 + }, + { + "epoch": 1.88, + "grad_norm": 5.6140088905394885, + "learning_rate": 3.695318862780712e-06, + "loss": 0.5336, + "step": 11040 + }, + { + "epoch": 1.88, + "grad_norm": 5.222058289128649, + "learning_rate": 3.6905453133518266e-06, + "loss": 0.5342, + "step": 11045 + }, + { + "epoch": 1.88, + "grad_norm": 9.083708298443856, + "learning_rate": 3.6857730449977807e-06, + "loss": 0.532, + "step": 11050 + }, + { + "epoch": 1.88, + "grad_norm": 10.372511390623997, + "learning_rate": 3.6810020623874143e-06, + "loss": 0.5339, + "step": 11055 + }, + { + "epoch": 1.88, + "grad_norm": 7.828966228991336, + "learning_rate": 3.676232370188305e-06, + "loss": 0.5449, + "step": 11060 + }, + { + "epoch": 1.88, + "grad_norm": 5.061506476661475, + "learning_rate": 3.6714639730667733e-06, + "loss": 0.5439, + "step": 11065 + }, + { + "epoch": 1.88, + "grad_norm": 6.986951778483274, + "learning_rate": 3.6666968756878706e-06, + "loss": 0.5391, + "step": 11070 + }, + { + "epoch": 1.88, + "grad_norm": 6.697073681510214, + "learning_rate": 3.6619310827153777e-06, + "loss": 0.5255, + "step": 11075 + }, + { + "epoch": 1.88, + "grad_norm": 7.48917727035367, + "learning_rate": 3.6571665988117964e-06, + "loss": 0.5353, + "step": 11080 + }, + { + "epoch": 1.88, + "grad_norm": 14.95409608824707, + "learning_rate": 3.6524034286383512e-06, + "loss": 0.5388, + "step": 11085 + }, + { + "epoch": 1.89, + "grad_norm": 8.739997106110412, + "learning_rate": 3.647641576854979e-06, + "loss": 0.5397, + "step": 11090 + }, + { + "epoch": 1.89, + "grad_norm": 5.702913793269424, + "learning_rate": 3.6428810481203314e-06, + "loss": 0.5392, + "step": 11095 + }, + { + "epoch": 1.89, + "grad_norm": 10.152523882009815, + "learning_rate": 3.6381218470917566e-06, + "loss": 0.5367, + "step": 11100 + }, + { + "epoch": 1.89, + "grad_norm": 14.924401718875243, + "learning_rate": 3.6333639784253116e-06, + "loss": 0.5328, + "step": 11105 + }, + { + "epoch": 1.89, + "grad_norm": 11.220084367718904, + "learning_rate": 3.6286074467757488e-06, + "loss": 0.5378, + "step": 11110 + }, + { + "epoch": 1.89, + "grad_norm": 7.6355010730653685, + "learning_rate": 3.623852256796511e-06, + "loss": 0.5417, + "step": 11115 + }, + { + "epoch": 1.89, + "grad_norm": 16.697080845793394, + "learning_rate": 3.6190984131397277e-06, + "loss": 0.5458, + "step": 11120 + }, + { + "epoch": 1.89, + "grad_norm": 7.4948154448340105, + "learning_rate": 3.6143459204562128e-06, + "loss": 0.529, + "step": 11125 + }, + { + "epoch": 1.89, + "grad_norm": 16.98852882390966, + "learning_rate": 3.609594783395458e-06, + "loss": 0.5299, + "step": 11130 + }, + { + "epoch": 1.89, + "grad_norm": 5.254750013758175, + "learning_rate": 3.604845006605632e-06, + "loss": 0.5268, + "step": 11135 + }, + { + "epoch": 1.89, + "grad_norm": 9.746954504175433, + "learning_rate": 3.600096594733564e-06, + "loss": 0.5483, + "step": 11140 + }, + { + "epoch": 1.89, + "grad_norm": 7.365857124321258, + "learning_rate": 3.5953495524247573e-06, + "loss": 0.5281, + "step": 11145 + }, + { + "epoch": 1.9, + "grad_norm": 12.329693543819747, + "learning_rate": 3.5906038843233693e-06, + "loss": 0.5374, + "step": 11150 + }, + { + "epoch": 1.9, + "grad_norm": 5.801535909625722, + "learning_rate": 3.585859595072216e-06, + "loss": 0.5276, + "step": 11155 + }, + { + "epoch": 1.9, + "grad_norm": 12.397119396349337, + "learning_rate": 3.5811166893127646e-06, + "loss": 0.5316, + "step": 11160 + }, + { + "epoch": 1.9, + "grad_norm": 21.842813538079348, + "learning_rate": 3.576375171685126e-06, + "loss": 0.5408, + "step": 11165 + }, + { + "epoch": 1.9, + "grad_norm": 12.231416398872692, + "learning_rate": 3.5716350468280553e-06, + "loss": 0.5301, + "step": 11170 + }, + { + "epoch": 1.9, + "grad_norm": 5.519093114896531, + "learning_rate": 3.566896319378947e-06, + "loss": 0.5304, + "step": 11175 + }, + { + "epoch": 1.9, + "grad_norm": 5.940488754631659, + "learning_rate": 3.562158993973821e-06, + "loss": 0.5465, + "step": 11180 + }, + { + "epoch": 1.9, + "grad_norm": 5.299602282158513, + "learning_rate": 3.5574230752473336e-06, + "loss": 0.5242, + "step": 11185 + }, + { + "epoch": 1.9, + "grad_norm": 5.554576788811912, + "learning_rate": 3.5526885678327617e-06, + "loss": 0.5384, + "step": 11190 + }, + { + "epoch": 1.9, + "grad_norm": 6.904279904285887, + "learning_rate": 3.5479554763620016e-06, + "loss": 0.5238, + "step": 11195 + }, + { + "epoch": 1.9, + "grad_norm": 9.20256437712205, + "learning_rate": 3.5432238054655633e-06, + "loss": 0.5232, + "step": 11200 + }, + { + "epoch": 1.9, + "grad_norm": 8.122344904169903, + "learning_rate": 3.53849355977257e-06, + "loss": 0.5497, + "step": 11205 + }, + { + "epoch": 1.91, + "grad_norm": 7.152097367658841, + "learning_rate": 3.533764743910747e-06, + "loss": 0.5314, + "step": 11210 + }, + { + "epoch": 1.91, + "grad_norm": 7.84948937139005, + "learning_rate": 3.529037362506424e-06, + "loss": 0.537, + "step": 11215 + }, + { + "epoch": 1.91, + "grad_norm": 5.577212182155189, + "learning_rate": 3.5243114201845242e-06, + "loss": 0.5112, + "step": 11220 + }, + { + "epoch": 1.91, + "grad_norm": 6.170134880436668, + "learning_rate": 3.519586921568564e-06, + "loss": 0.5338, + "step": 11225 + }, + { + "epoch": 1.91, + "grad_norm": 8.220419192071601, + "learning_rate": 3.5148638712806486e-06, + "loss": 0.5082, + "step": 11230 + }, + { + "epoch": 1.91, + "grad_norm": 6.7340417910071455, + "learning_rate": 3.5101422739414657e-06, + "loss": 0.5183, + "step": 11235 + }, + { + "epoch": 1.91, + "grad_norm": 6.07802322303307, + "learning_rate": 3.5054221341702815e-06, + "loss": 0.5209, + "step": 11240 + }, + { + "epoch": 1.91, + "grad_norm": 10.700269535527903, + "learning_rate": 3.500703456584935e-06, + "loss": 0.5133, + "step": 11245 + }, + { + "epoch": 1.91, + "grad_norm": 19.52570025614485, + "learning_rate": 3.495986245801839e-06, + "loss": 0.5262, + "step": 11250 + }, + { + "epoch": 1.91, + "grad_norm": 18.298198955013202, + "learning_rate": 3.4912705064359643e-06, + "loss": 0.5205, + "step": 11255 + }, + { + "epoch": 1.91, + "grad_norm": 10.119842025529667, + "learning_rate": 3.486556243100847e-06, + "loss": 0.5261, + "step": 11260 + }, + { + "epoch": 1.92, + "grad_norm": 8.108706346417808, + "learning_rate": 3.481843460408579e-06, + "loss": 0.5328, + "step": 11265 + }, + { + "epoch": 1.92, + "grad_norm": 6.859265643783456, + "learning_rate": 3.4771321629698008e-06, + "loss": 0.5452, + "step": 11270 + }, + { + "epoch": 1.92, + "grad_norm": 8.659912383492827, + "learning_rate": 3.472422355393703e-06, + "loss": 0.5287, + "step": 11275 + }, + { + "epoch": 1.92, + "grad_norm": 6.979094507974204, + "learning_rate": 3.4677140422880172e-06, + "loss": 0.5408, + "step": 11280 + }, + { + "epoch": 1.92, + "grad_norm": 6.489033753103027, + "learning_rate": 3.4630072282590135e-06, + "loss": 0.5448, + "step": 11285 + }, + { + "epoch": 1.92, + "grad_norm": 6.576397068629151, + "learning_rate": 3.4583019179114948e-06, + "loss": 0.5268, + "step": 11290 + }, + { + "epoch": 1.92, + "grad_norm": 7.561886265766148, + "learning_rate": 3.453598115848795e-06, + "loss": 0.5308, + "step": 11295 + }, + { + "epoch": 1.92, + "grad_norm": 6.515198041434377, + "learning_rate": 3.448895826672767e-06, + "loss": 0.5279, + "step": 11300 + }, + { + "epoch": 1.92, + "grad_norm": 6.760420913784969, + "learning_rate": 3.444195054983788e-06, + "loss": 0.5204, + "step": 11305 + }, + { + "epoch": 1.92, + "grad_norm": 20.619225924917643, + "learning_rate": 3.439495805380752e-06, + "loss": 0.5316, + "step": 11310 + }, + { + "epoch": 1.92, + "grad_norm": 9.939695464936968, + "learning_rate": 3.4347980824610593e-06, + "loss": 0.5282, + "step": 11315 + }, + { + "epoch": 1.92, + "grad_norm": 8.492607689047357, + "learning_rate": 3.4301018908206198e-06, + "loss": 0.5282, + "step": 11320 + }, + { + "epoch": 1.93, + "grad_norm": 9.5726352155456, + "learning_rate": 3.4254072350538437e-06, + "loss": 0.5154, + "step": 11325 + }, + { + "epoch": 1.93, + "grad_norm": 7.521845531151163, + "learning_rate": 3.420714119753641e-06, + "loss": 0.5391, + "step": 11330 + }, + { + "epoch": 1.93, + "grad_norm": 6.394116452417119, + "learning_rate": 3.4160225495114134e-06, + "loss": 0.541, + "step": 11335 + }, + { + "epoch": 1.93, + "grad_norm": 11.445030342660791, + "learning_rate": 3.4113325289170475e-06, + "loss": 0.5275, + "step": 11340 + }, + { + "epoch": 1.93, + "grad_norm": 18.940504419644416, + "learning_rate": 3.4066440625589186e-06, + "loss": 0.5179, + "step": 11345 + }, + { + "epoch": 1.93, + "grad_norm": 5.6345428352537965, + "learning_rate": 3.4019571550238816e-06, + "loss": 0.5142, + "step": 11350 + }, + { + "epoch": 1.93, + "grad_norm": 9.461369428184476, + "learning_rate": 3.3972718108972612e-06, + "loss": 0.5224, + "step": 11355 + }, + { + "epoch": 1.93, + "grad_norm": 29.354729919652407, + "learning_rate": 3.3925880347628577e-06, + "loss": 0.5264, + "step": 11360 + }, + { + "epoch": 1.93, + "grad_norm": 14.403518167501197, + "learning_rate": 3.3879058312029354e-06, + "loss": 0.5282, + "step": 11365 + }, + { + "epoch": 1.93, + "grad_norm": 5.988468122093689, + "learning_rate": 3.3832252047982206e-06, + "loss": 0.5311, + "step": 11370 + }, + { + "epoch": 1.93, + "grad_norm": 6.384055884376355, + "learning_rate": 3.378546160127899e-06, + "loss": 0.5235, + "step": 11375 + }, + { + "epoch": 1.93, + "grad_norm": 5.98755000776099, + "learning_rate": 3.3738687017696004e-06, + "loss": 0.5422, + "step": 11380 + }, + { + "epoch": 1.94, + "grad_norm": 5.865728926427982, + "learning_rate": 3.3691928342994117e-06, + "loss": 0.5299, + "step": 11385 + }, + { + "epoch": 1.94, + "grad_norm": 7.669535823097867, + "learning_rate": 3.364518562291861e-06, + "loss": 0.5216, + "step": 11390 + }, + { + "epoch": 1.94, + "grad_norm": 20.042979966523028, + "learning_rate": 3.359845890319914e-06, + "loss": 0.5181, + "step": 11395 + }, + { + "epoch": 1.94, + "grad_norm": 7.242460329586381, + "learning_rate": 3.3551748229549695e-06, + "loss": 0.5314, + "step": 11400 + }, + { + "epoch": 1.94, + "grad_norm": 6.409967046664902, + "learning_rate": 3.3505053647668616e-06, + "loss": 0.5403, + "step": 11405 + }, + { + "epoch": 1.94, + "grad_norm": 15.280372712285226, + "learning_rate": 3.3458375203238456e-06, + "loss": 0.5252, + "step": 11410 + }, + { + "epoch": 1.94, + "grad_norm": 19.565929616107933, + "learning_rate": 3.3411712941926027e-06, + "loss": 0.5301, + "step": 11415 + }, + { + "epoch": 1.94, + "grad_norm": 7.77112520924579, + "learning_rate": 3.3365066909382233e-06, + "loss": 0.5281, + "step": 11420 + }, + { + "epoch": 1.94, + "grad_norm": 8.77064494485486, + "learning_rate": 3.331843715124216e-06, + "loss": 0.5188, + "step": 11425 + }, + { + "epoch": 1.94, + "grad_norm": 5.3463404607000555, + "learning_rate": 3.3271823713124973e-06, + "loss": 0.5279, + "step": 11430 + }, + { + "epoch": 1.94, + "grad_norm": 7.202088692534696, + "learning_rate": 3.3225226640633835e-06, + "loss": 0.5118, + "step": 11435 + }, + { + "epoch": 1.94, + "grad_norm": 5.539992983116485, + "learning_rate": 3.317864597935595e-06, + "loss": 0.5225, + "step": 11440 + }, + { + "epoch": 1.95, + "grad_norm": 9.612631868756232, + "learning_rate": 3.3132081774862403e-06, + "loss": 0.527, + "step": 11445 + }, + { + "epoch": 1.95, + "grad_norm": 7.363407207718291, + "learning_rate": 3.308553407270822e-06, + "loss": 0.5433, + "step": 11450 + }, + { + "epoch": 1.95, + "grad_norm": 5.753353395071274, + "learning_rate": 3.30390029184323e-06, + "loss": 0.5056, + "step": 11455 + }, + { + "epoch": 1.95, + "grad_norm": 6.972212324066545, + "learning_rate": 3.299248835755728e-06, + "loss": 0.5234, + "step": 11460 + }, + { + "epoch": 1.95, + "grad_norm": 5.2989976391151785, + "learning_rate": 3.2945990435589636e-06, + "loss": 0.5205, + "step": 11465 + }, + { + "epoch": 1.95, + "grad_norm": 14.382665579425526, + "learning_rate": 3.289950919801954e-06, + "loss": 0.5344, + "step": 11470 + }, + { + "epoch": 1.95, + "grad_norm": 5.614165948682854, + "learning_rate": 3.2853044690320836e-06, + "loss": 0.5335, + "step": 11475 + }, + { + "epoch": 1.95, + "grad_norm": 18.54799855202085, + "learning_rate": 3.2806596957951003e-06, + "loss": 0.518, + "step": 11480 + }, + { + "epoch": 1.95, + "grad_norm": 11.260869030494261, + "learning_rate": 3.2760166046351127e-06, + "loss": 0.5177, + "step": 11485 + }, + { + "epoch": 1.95, + "grad_norm": 8.396344032296104, + "learning_rate": 3.2713752000945792e-06, + "loss": 0.5346, + "step": 11490 + }, + { + "epoch": 1.95, + "grad_norm": 23.880420866969953, + "learning_rate": 3.266735486714314e-06, + "loss": 0.5279, + "step": 11495 + }, + { + "epoch": 1.96, + "grad_norm": 22.70728514331903, + "learning_rate": 3.2620974690334723e-06, + "loss": 0.5239, + "step": 11500 + }, + { + "epoch": 1.96, + "grad_norm": 16.493266956279886, + "learning_rate": 3.257461151589551e-06, + "loss": 0.5259, + "step": 11505 + }, + { + "epoch": 1.96, + "grad_norm": 22.029387258063146, + "learning_rate": 3.2528265389183857e-06, + "loss": 0.5322, + "step": 11510 + }, + { + "epoch": 1.96, + "grad_norm": 28.678050417980476, + "learning_rate": 3.2481936355541425e-06, + "loss": 0.5215, + "step": 11515 + }, + { + "epoch": 1.96, + "grad_norm": 12.50147938948209, + "learning_rate": 3.2435624460293163e-06, + "loss": 0.5234, + "step": 11520 + }, + { + "epoch": 1.96, + "grad_norm": 14.973628043093376, + "learning_rate": 3.2389329748747246e-06, + "loss": 0.5102, + "step": 11525 + }, + { + "epoch": 1.96, + "grad_norm": 9.999565283031597, + "learning_rate": 3.2343052266195044e-06, + "loss": 0.5101, + "step": 11530 + }, + { + "epoch": 1.96, + "grad_norm": 5.374175195674296, + "learning_rate": 3.2296792057911064e-06, + "loss": 0.5122, + "step": 11535 + }, + { + "epoch": 1.96, + "grad_norm": 6.887472400804496, + "learning_rate": 3.22505491691529e-06, + "loss": 0.5198, + "step": 11540 + }, + { + "epoch": 1.96, + "grad_norm": 11.888601482195552, + "learning_rate": 3.220432364516124e-06, + "loss": 0.5243, + "step": 11545 + }, + { + "epoch": 1.96, + "grad_norm": 5.117889813946124, + "learning_rate": 3.215811553115974e-06, + "loss": 0.517, + "step": 11550 + }, + { + "epoch": 1.96, + "grad_norm": 6.489191260544375, + "learning_rate": 3.2111924872355055e-06, + "loss": 0.5341, + "step": 11555 + }, + { + "epoch": 1.97, + "grad_norm": 10.108146934501852, + "learning_rate": 3.2065751713936757e-06, + "loss": 0.5316, + "step": 11560 + }, + { + "epoch": 1.97, + "grad_norm": 8.939846904829047, + "learning_rate": 3.2019596101077276e-06, + "loss": 0.5276, + "step": 11565 + }, + { + "epoch": 1.97, + "grad_norm": 5.798886561943347, + "learning_rate": 3.197345807893191e-06, + "loss": 0.5185, + "step": 11570 + }, + { + "epoch": 1.97, + "grad_norm": 5.562157928123866, + "learning_rate": 3.192733769263874e-06, + "loss": 0.5167, + "step": 11575 + }, + { + "epoch": 1.97, + "grad_norm": 8.255025145423334, + "learning_rate": 3.1881234987318554e-06, + "loss": 0.5287, + "step": 11580 + }, + { + "epoch": 1.97, + "grad_norm": 9.00519837851638, + "learning_rate": 3.183515000807488e-06, + "loss": 0.5323, + "step": 11585 + }, + { + "epoch": 1.97, + "grad_norm": 10.33740503879071, + "learning_rate": 3.178908279999392e-06, + "loss": 0.5257, + "step": 11590 + }, + { + "epoch": 1.97, + "grad_norm": 8.941774823430713, + "learning_rate": 3.174303340814443e-06, + "loss": 0.5405, + "step": 11595 + }, + { + "epoch": 1.97, + "grad_norm": 5.764515099338116, + "learning_rate": 3.16970018775778e-06, + "loss": 0.5194, + "step": 11600 + }, + { + "epoch": 1.97, + "grad_norm": 7.769381791545381, + "learning_rate": 3.1650988253327906e-06, + "loss": 0.5261, + "step": 11605 + }, + { + "epoch": 1.97, + "grad_norm": 5.430163930611821, + "learning_rate": 3.160499258041112e-06, + "loss": 0.5204, + "step": 11610 + }, + { + "epoch": 1.97, + "grad_norm": 5.8678313815022545, + "learning_rate": 3.1559014903826245e-06, + "loss": 0.5267, + "step": 11615 + }, + { + "epoch": 1.98, + "grad_norm": 6.961306154968712, + "learning_rate": 3.1513055268554518e-06, + "loss": 0.5123, + "step": 11620 + }, + { + "epoch": 1.98, + "grad_norm": 6.386787947968249, + "learning_rate": 3.146711371955943e-06, + "loss": 0.5293, + "step": 11625 + }, + { + "epoch": 1.98, + "grad_norm": 13.380022599514302, + "learning_rate": 3.142119030178688e-06, + "loss": 0.519, + "step": 11630 + }, + { + "epoch": 1.98, + "grad_norm": 15.601956854275004, + "learning_rate": 3.1375285060164963e-06, + "loss": 0.5218, + "step": 11635 + }, + { + "epoch": 1.98, + "grad_norm": 9.956209340415413, + "learning_rate": 3.132939803960402e-06, + "loss": 0.5152, + "step": 11640 + }, + { + "epoch": 1.98, + "grad_norm": 14.5091717789471, + "learning_rate": 3.128352928499657e-06, + "loss": 0.5064, + "step": 11645 + }, + { + "epoch": 1.98, + "grad_norm": 27.95353269895987, + "learning_rate": 3.123767884121725e-06, + "loss": 0.5348, + "step": 11650 + }, + { + "epoch": 1.98, + "grad_norm": 16.97543978293298, + "learning_rate": 3.1191846753122783e-06, + "loss": 0.5094, + "step": 11655 + }, + { + "epoch": 1.98, + "grad_norm": 14.329302870828528, + "learning_rate": 3.1146033065551964e-06, + "loss": 0.5126, + "step": 11660 + }, + { + "epoch": 1.98, + "grad_norm": 26.946136861360525, + "learning_rate": 3.11002378233255e-06, + "loss": 0.5222, + "step": 11665 + }, + { + "epoch": 1.98, + "grad_norm": 26.85683438822377, + "learning_rate": 3.1054461071246155e-06, + "loss": 0.5176, + "step": 11670 + }, + { + "epoch": 1.98, + "grad_norm": 32.54303271050792, + "learning_rate": 3.100870285409856e-06, + "loss": 0.5285, + "step": 11675 + }, + { + "epoch": 1.99, + "grad_norm": 22.70343510365221, + "learning_rate": 3.0962963216649196e-06, + "loss": 0.5281, + "step": 11680 + }, + { + "epoch": 1.99, + "grad_norm": 13.917301804129767, + "learning_rate": 3.0917242203646385e-06, + "loss": 0.5043, + "step": 11685 + }, + { + "epoch": 1.99, + "grad_norm": 5.110336385901124, + "learning_rate": 3.087153985982024e-06, + "loss": 0.5175, + "step": 11690 + }, + { + "epoch": 1.99, + "grad_norm": 5.677973496169243, + "learning_rate": 3.0825856229882584e-06, + "loss": 0.5124, + "step": 11695 + }, + { + "epoch": 1.99, + "grad_norm": 6.415213598801499, + "learning_rate": 3.078019135852698e-06, + "loss": 0.5211, + "step": 11700 + }, + { + "epoch": 1.99, + "grad_norm": 16.720758651500194, + "learning_rate": 3.073454529042854e-06, + "loss": 0.5154, + "step": 11705 + }, + { + "epoch": 1.99, + "grad_norm": 5.50117824566899, + "learning_rate": 3.068891807024409e-06, + "loss": 0.52, + "step": 11710 + }, + { + "epoch": 1.99, + "grad_norm": 5.441597888328433, + "learning_rate": 3.064330974261196e-06, + "loss": 0.5222, + "step": 11715 + }, + { + "epoch": 1.99, + "grad_norm": 12.022956215814487, + "learning_rate": 3.0597720352152004e-06, + "loss": 0.5113, + "step": 11720 + }, + { + "epoch": 1.99, + "grad_norm": 11.45410754613074, + "learning_rate": 3.0552149943465554e-06, + "loss": 0.5313, + "step": 11725 + }, + { + "epoch": 1.99, + "grad_norm": 9.656188542011064, + "learning_rate": 3.0506598561135362e-06, + "loss": 0.5154, + "step": 11730 + }, + { + "epoch": 2.0, + "grad_norm": 6.627915183745318, + "learning_rate": 3.0461066249725584e-06, + "loss": 0.5188, + "step": 11735 + }, + { + "epoch": 2.0, + "grad_norm": 5.616229006704577, + "learning_rate": 3.0415553053781725e-06, + "loss": 0.5208, + "step": 11740 + }, + { + "epoch": 2.0, + "grad_norm": 7.0943460014009, + "learning_rate": 3.037005901783053e-06, + "loss": 0.5194, + "step": 11745 + }, + { + "epoch": 2.0, + "grad_norm": 7.190068268493544, + "learning_rate": 3.032458418638008e-06, + "loss": 0.5208, + "step": 11750 + }, + { + "epoch": 2.0, + "grad_norm": 5.1286362517199136, + "learning_rate": 3.0279128603919593e-06, + "loss": 0.5125, + "step": 11755 + }, + { + "epoch": 2.0, + "grad_norm": 6.220220298533826, + "learning_rate": 3.0233692314919525e-06, + "loss": 0.5235, + "step": 11760 + }, + { + "epoch": 2.0, + "eval_loss": 0.3905054032802582, + "eval_runtime": 75.0128, + "eval_samples_per_second": 4.826, + "eval_steps_per_second": 0.613, + "step": 11764 + }, + { + "epoch": 2.0, + "grad_norm": 8.611507507714085, + "learning_rate": 3.018827536383142e-06, + "loss": 0.4829, + "step": 11765 + }, + { + "epoch": 2.0, + "grad_norm": 5.752742447036074, + "learning_rate": 3.0142877795087876e-06, + "loss": 0.3723, + "step": 11770 + }, + { + "epoch": 2.0, + "grad_norm": 8.804207868987879, + "learning_rate": 3.009749965310259e-06, + "loss": 0.3751, + "step": 11775 + }, + { + "epoch": 2.0, + "grad_norm": 6.48577340760751, + "learning_rate": 3.0052140982270228e-06, + "loss": 0.3786, + "step": 11780 + }, + { + "epoch": 2.0, + "grad_norm": 5.96000499515076, + "learning_rate": 3.0006801826966366e-06, + "loss": 0.3715, + "step": 11785 + }, + { + "epoch": 2.0, + "grad_norm": 5.683037451115184, + "learning_rate": 2.9961482231547535e-06, + "loss": 0.3728, + "step": 11790 + }, + { + "epoch": 2.01, + "grad_norm": 4.947056755410368, + "learning_rate": 2.9916182240351132e-06, + "loss": 0.3658, + "step": 11795 + }, + { + "epoch": 2.01, + "grad_norm": 9.175381671925878, + "learning_rate": 2.987090189769535e-06, + "loss": 0.3672, + "step": 11800 + }, + { + "epoch": 2.01, + "grad_norm": 7.202043240807192, + "learning_rate": 2.9825641247879167e-06, + "loss": 0.3792, + "step": 11805 + }, + { + "epoch": 2.01, + "grad_norm": 10.919930890072514, + "learning_rate": 2.9780400335182312e-06, + "loss": 0.357, + "step": 11810 + }, + { + "epoch": 2.01, + "grad_norm": 5.458484845692553, + "learning_rate": 2.973517920386517e-06, + "loss": 0.3642, + "step": 11815 + }, + { + "epoch": 2.01, + "grad_norm": 12.605985113201717, + "learning_rate": 2.968997789816882e-06, + "loss": 0.366, + "step": 11820 + }, + { + "epoch": 2.01, + "grad_norm": 10.17730548605672, + "learning_rate": 2.9644796462314897e-06, + "loss": 0.3685, + "step": 11825 + }, + { + "epoch": 2.01, + "grad_norm": 10.988272278395259, + "learning_rate": 2.959963494050562e-06, + "loss": 0.3653, + "step": 11830 + }, + { + "epoch": 2.01, + "grad_norm": 11.730790106634236, + "learning_rate": 2.955449337692372e-06, + "loss": 0.3605, + "step": 11835 + }, + { + "epoch": 2.01, + "grad_norm": 5.492211689650247, + "learning_rate": 2.9509371815732415e-06, + "loss": 0.3645, + "step": 11840 + }, + { + "epoch": 2.01, + "grad_norm": 7.540976427148941, + "learning_rate": 2.946427030107534e-06, + "loss": 0.3642, + "step": 11845 + }, + { + "epoch": 2.01, + "grad_norm": 4.579453176000582, + "learning_rate": 2.9419188877076534e-06, + "loss": 0.3693, + "step": 11850 + }, + { + "epoch": 2.02, + "grad_norm": 5.304416580656278, + "learning_rate": 2.9374127587840373e-06, + "loss": 0.3522, + "step": 11855 + }, + { + "epoch": 2.02, + "grad_norm": 5.3593333215489, + "learning_rate": 2.932908647745152e-06, + "loss": 0.367, + "step": 11860 + }, + { + "epoch": 2.02, + "grad_norm": 12.497643274627988, + "learning_rate": 2.9284065589974915e-06, + "loss": 0.3633, + "step": 11865 + }, + { + "epoch": 2.02, + "grad_norm": 13.575720144681346, + "learning_rate": 2.9239064969455686e-06, + "loss": 0.3656, + "step": 11870 + }, + { + "epoch": 2.02, + "grad_norm": 5.418062852100978, + "learning_rate": 2.9194084659919176e-06, + "loss": 0.3624, + "step": 11875 + }, + { + "epoch": 2.02, + "grad_norm": 8.713970180416974, + "learning_rate": 2.914912470537081e-06, + "loss": 0.3605, + "step": 11880 + }, + { + "epoch": 2.02, + "grad_norm": 6.3675790522641655, + "learning_rate": 2.9104185149796166e-06, + "loss": 0.3577, + "step": 11885 + }, + { + "epoch": 2.02, + "grad_norm": 5.380851351218553, + "learning_rate": 2.9059266037160804e-06, + "loss": 0.3642, + "step": 11890 + }, + { + "epoch": 2.02, + "grad_norm": 5.881021953784742, + "learning_rate": 2.9014367411410272e-06, + "loss": 0.3689, + "step": 11895 + }, + { + "epoch": 2.02, + "grad_norm": 5.224788190005951, + "learning_rate": 2.896948931647018e-06, + "loss": 0.3706, + "step": 11900 + }, + { + "epoch": 2.02, + "grad_norm": 7.267290416239944, + "learning_rate": 2.8924631796245896e-06, + "loss": 0.3626, + "step": 11905 + }, + { + "epoch": 2.02, + "grad_norm": 5.722158276142785, + "learning_rate": 2.8879794894622794e-06, + "loss": 0.3549, + "step": 11910 + }, + { + "epoch": 2.03, + "grad_norm": 6.665683638849933, + "learning_rate": 2.883497865546599e-06, + "loss": 0.3582, + "step": 11915 + }, + { + "epoch": 2.03, + "grad_norm": 11.717660943671277, + "learning_rate": 2.8790183122620455e-06, + "loss": 0.3557, + "step": 11920 + }, + { + "epoch": 2.03, + "grad_norm": 6.744844839312123, + "learning_rate": 2.8745408339910857e-06, + "loss": 0.3629, + "step": 11925 + }, + { + "epoch": 2.03, + "grad_norm": 5.254584277873432, + "learning_rate": 2.8700654351141546e-06, + "loss": 0.3626, + "step": 11930 + }, + { + "epoch": 2.03, + "grad_norm": 5.076849011568416, + "learning_rate": 2.865592120009659e-06, + "loss": 0.3665, + "step": 11935 + }, + { + "epoch": 2.03, + "grad_norm": 9.89105922641013, + "learning_rate": 2.8611208930539635e-06, + "loss": 0.3748, + "step": 11940 + }, + { + "epoch": 2.03, + "grad_norm": 8.278339030031562, + "learning_rate": 2.8566517586213895e-06, + "loss": 0.3752, + "step": 11945 + }, + { + "epoch": 2.03, + "grad_norm": 5.055266207685965, + "learning_rate": 2.852184721084208e-06, + "loss": 0.3645, + "step": 11950 + }, + { + "epoch": 2.03, + "grad_norm": 7.148044998103296, + "learning_rate": 2.847719784812648e-06, + "loss": 0.3722, + "step": 11955 + }, + { + "epoch": 2.03, + "grad_norm": 8.54289037896895, + "learning_rate": 2.8432569541748728e-06, + "loss": 0.3609, + "step": 11960 + }, + { + "epoch": 2.03, + "grad_norm": 8.931371433064498, + "learning_rate": 2.8387962335369935e-06, + "loss": 0.3658, + "step": 11965 + }, + { + "epoch": 2.04, + "grad_norm": 13.930370778361347, + "learning_rate": 2.8343376272630524e-06, + "loss": 0.3487, + "step": 11970 + }, + { + "epoch": 2.04, + "grad_norm": 10.26844641650417, + "learning_rate": 2.8298811397150217e-06, + "loss": 0.3588, + "step": 11975 + }, + { + "epoch": 2.04, + "grad_norm": 9.382054822171709, + "learning_rate": 2.825426775252806e-06, + "loss": 0.3578, + "step": 11980 + }, + { + "epoch": 2.04, + "grad_norm": 5.611521064546941, + "learning_rate": 2.8209745382342312e-06, + "loss": 0.3558, + "step": 11985 + }, + { + "epoch": 2.04, + "grad_norm": 5.068563602349961, + "learning_rate": 2.8165244330150383e-06, + "loss": 0.364, + "step": 11990 + }, + { + "epoch": 2.04, + "grad_norm": 4.357350781394905, + "learning_rate": 2.812076463948884e-06, + "loss": 0.3672, + "step": 11995 + }, + { + "epoch": 2.04, + "grad_norm": 6.591605468812917, + "learning_rate": 2.8076306353873413e-06, + "loss": 0.3651, + "step": 12000 + }, + { + "epoch": 2.04, + "grad_norm": 5.106233634138435, + "learning_rate": 2.8031869516798794e-06, + "loss": 0.3523, + "step": 12005 + }, + { + "epoch": 2.04, + "grad_norm": 5.718458446762907, + "learning_rate": 2.798745417173877e-06, + "loss": 0.3632, + "step": 12010 + }, + { + "epoch": 2.04, + "grad_norm": 6.2380488137176675, + "learning_rate": 2.7943060362146068e-06, + "loss": 0.357, + "step": 12015 + }, + { + "epoch": 2.04, + "grad_norm": 13.238093997370028, + "learning_rate": 2.7898688131452344e-06, + "loss": 0.3616, + "step": 12020 + }, + { + "epoch": 2.04, + "grad_norm": 4.566161035481949, + "learning_rate": 2.785433752306812e-06, + "loss": 0.3568, + "step": 12025 + }, + { + "epoch": 2.05, + "grad_norm": 5.614923482223124, + "learning_rate": 2.7810008580382843e-06, + "loss": 0.358, + "step": 12030 + }, + { + "epoch": 2.05, + "grad_norm": 4.801101666626508, + "learning_rate": 2.776570134676469e-06, + "loss": 0.3627, + "step": 12035 + }, + { + "epoch": 2.05, + "grad_norm": 5.401932712886612, + "learning_rate": 2.77214158655606e-06, + "loss": 0.3577, + "step": 12040 + }, + { + "epoch": 2.05, + "grad_norm": 5.629093893600516, + "learning_rate": 2.76771521800963e-06, + "loss": 0.3602, + "step": 12045 + }, + { + "epoch": 2.05, + "grad_norm": 8.84615246777452, + "learning_rate": 2.76329103336761e-06, + "loss": 0.3629, + "step": 12050 + }, + { + "epoch": 2.05, + "grad_norm": 6.493679745481419, + "learning_rate": 2.7588690369583025e-06, + "loss": 0.3584, + "step": 12055 + }, + { + "epoch": 2.05, + "grad_norm": 4.650688020597786, + "learning_rate": 2.7544492331078667e-06, + "loss": 0.3573, + "step": 12060 + }, + { + "epoch": 2.05, + "grad_norm": 4.875759976969528, + "learning_rate": 2.750031626140313e-06, + "loss": 0.3633, + "step": 12065 + }, + { + "epoch": 2.05, + "grad_norm": 12.228597460057165, + "learning_rate": 2.745616220377504e-06, + "loss": 0.3627, + "step": 12070 + }, + { + "epoch": 2.05, + "grad_norm": 12.359671405062235, + "learning_rate": 2.7412030201391553e-06, + "loss": 0.3614, + "step": 12075 + }, + { + "epoch": 2.05, + "grad_norm": 6.22622924385044, + "learning_rate": 2.7367920297428174e-06, + "loss": 0.3591, + "step": 12080 + }, + { + "epoch": 2.05, + "grad_norm": 9.388816381649836, + "learning_rate": 2.7323832535038787e-06, + "loss": 0.3547, + "step": 12085 + }, + { + "epoch": 2.06, + "grad_norm": 13.70592439616937, + "learning_rate": 2.727976695735568e-06, + "loss": 0.3613, + "step": 12090 + }, + { + "epoch": 2.06, + "grad_norm": 8.429611413044588, + "learning_rate": 2.7235723607489357e-06, + "loss": 0.3639, + "step": 12095 + }, + { + "epoch": 2.06, + "grad_norm": 5.446747254918879, + "learning_rate": 2.719170252852868e-06, + "loss": 0.3565, + "step": 12100 + }, + { + "epoch": 2.06, + "grad_norm": 9.694603382556856, + "learning_rate": 2.7147703763540567e-06, + "loss": 0.3575, + "step": 12105 + }, + { + "epoch": 2.06, + "grad_norm": 10.344482002591908, + "learning_rate": 2.7103727355570277e-06, + "loss": 0.3599, + "step": 12110 + }, + { + "epoch": 2.06, + "grad_norm": 7.230043253110429, + "learning_rate": 2.7059773347641048e-06, + "loss": 0.3783, + "step": 12115 + }, + { + "epoch": 2.06, + "grad_norm": 4.911178835416952, + "learning_rate": 2.701584178275433e-06, + "loss": 0.3507, + "step": 12120 + }, + { + "epoch": 2.06, + "grad_norm": 7.060856281038126, + "learning_rate": 2.6971932703889534e-06, + "loss": 0.3577, + "step": 12125 + }, + { + "epoch": 2.06, + "grad_norm": 5.095825633746809, + "learning_rate": 2.6928046154004083e-06, + "loss": 0.348, + "step": 12130 + }, + { + "epoch": 2.06, + "grad_norm": 5.33022918116732, + "learning_rate": 2.6884182176033397e-06, + "loss": 0.3597, + "step": 12135 + }, + { + "epoch": 2.06, + "grad_norm": 7.14854869257719, + "learning_rate": 2.684034081289078e-06, + "loss": 0.3654, + "step": 12140 + }, + { + "epoch": 2.06, + "grad_norm": 7.175699068398426, + "learning_rate": 2.6796522107467417e-06, + "loss": 0.3471, + "step": 12145 + }, + { + "epoch": 2.07, + "grad_norm": 11.87073571076359, + "learning_rate": 2.6752726102632307e-06, + "loss": 0.3646, + "step": 12150 + }, + { + "epoch": 2.07, + "grad_norm": 5.265772381937061, + "learning_rate": 2.670895284123231e-06, + "loss": 0.3583, + "step": 12155 + }, + { + "epoch": 2.07, + "grad_norm": 9.51482295186804, + "learning_rate": 2.666520236609196e-06, + "loss": 0.3474, + "step": 12160 + }, + { + "epoch": 2.07, + "grad_norm": 8.52752360844317, + "learning_rate": 2.662147472001352e-06, + "loss": 0.3687, + "step": 12165 + }, + { + "epoch": 2.07, + "grad_norm": 5.657318428452209, + "learning_rate": 2.6577769945776942e-06, + "loss": 0.3708, + "step": 12170 + }, + { + "epoch": 2.07, + "grad_norm": 14.585897856994992, + "learning_rate": 2.653408808613977e-06, + "loss": 0.3748, + "step": 12175 + }, + { + "epoch": 2.07, + "grad_norm": 12.455744504231912, + "learning_rate": 2.6490429183837195e-06, + "loss": 0.3574, + "step": 12180 + }, + { + "epoch": 2.07, + "grad_norm": 6.3858811970077705, + "learning_rate": 2.6446793281581815e-06, + "loss": 0.356, + "step": 12185 + }, + { + "epoch": 2.07, + "grad_norm": 13.118397592546135, + "learning_rate": 2.640318042206387e-06, + "loss": 0.3654, + "step": 12190 + }, + { + "epoch": 2.07, + "grad_norm": 9.898867934283263, + "learning_rate": 2.635959064795097e-06, + "loss": 0.3513, + "step": 12195 + }, + { + "epoch": 2.07, + "grad_norm": 5.454220363751431, + "learning_rate": 2.6316024001888195e-06, + "loss": 0.3684, + "step": 12200 + }, + { + "epoch": 2.07, + "grad_norm": 5.251292384038412, + "learning_rate": 2.6272480526497952e-06, + "loss": 0.3517, + "step": 12205 + }, + { + "epoch": 2.08, + "grad_norm": 9.71362624669713, + "learning_rate": 2.622896026437998e-06, + "loss": 0.3635, + "step": 12210 + }, + { + "epoch": 2.08, + "grad_norm": 6.455864586011419, + "learning_rate": 2.6185463258111355e-06, + "loss": 0.3568, + "step": 12215 + }, + { + "epoch": 2.08, + "grad_norm": 6.345032911435628, + "learning_rate": 2.614198955024637e-06, + "loss": 0.3621, + "step": 12220 + }, + { + "epoch": 2.08, + "grad_norm": 7.085325440928504, + "learning_rate": 2.6098539183316508e-06, + "loss": 0.3572, + "step": 12225 + }, + { + "epoch": 2.08, + "grad_norm": 8.755306263476994, + "learning_rate": 2.6055112199830423e-06, + "loss": 0.3632, + "step": 12230 + }, + { + "epoch": 2.08, + "grad_norm": 4.37675672449101, + "learning_rate": 2.601170864227395e-06, + "loss": 0.362, + "step": 12235 + }, + { + "epoch": 2.08, + "grad_norm": 10.004487280961618, + "learning_rate": 2.5968328553109912e-06, + "loss": 0.3607, + "step": 12240 + }, + { + "epoch": 2.08, + "grad_norm": 11.383232716403265, + "learning_rate": 2.5924971974778257e-06, + "loss": 0.3695, + "step": 12245 + }, + { + "epoch": 2.08, + "grad_norm": 6.952935057045326, + "learning_rate": 2.5881638949695886e-06, + "loss": 0.3528, + "step": 12250 + }, + { + "epoch": 2.08, + "grad_norm": 6.056211499559283, + "learning_rate": 2.5838329520256645e-06, + "loss": 0.3605, + "step": 12255 + }, + { + "epoch": 2.08, + "grad_norm": 8.277901307181128, + "learning_rate": 2.579504372883134e-06, + "loss": 0.3646, + "step": 12260 + }, + { + "epoch": 2.09, + "grad_norm": 6.69486470616986, + "learning_rate": 2.575178161776763e-06, + "loss": 0.3647, + "step": 12265 + }, + { + "epoch": 2.09, + "grad_norm": 9.170144636951997, + "learning_rate": 2.5708543229389995e-06, + "loss": 0.3556, + "step": 12270 + }, + { + "epoch": 2.09, + "grad_norm": 4.872922231980344, + "learning_rate": 2.5665328605999696e-06, + "loss": 0.3726, + "step": 12275 + }, + { + "epoch": 2.09, + "grad_norm": 7.288978068586903, + "learning_rate": 2.5622137789874803e-06, + "loss": 0.3619, + "step": 12280 + }, + { + "epoch": 2.09, + "grad_norm": 5.131810848831009, + "learning_rate": 2.557897082327002e-06, + "loss": 0.3658, + "step": 12285 + }, + { + "epoch": 2.09, + "grad_norm": 4.919657846426334, + "learning_rate": 2.5535827748416797e-06, + "loss": 0.3698, + "step": 12290 + }, + { + "epoch": 2.09, + "grad_norm": 5.168193650629757, + "learning_rate": 2.5492708607523144e-06, + "loss": 0.3661, + "step": 12295 + }, + { + "epoch": 2.09, + "grad_norm": 10.285402235676456, + "learning_rate": 2.544961344277368e-06, + "loss": 0.3745, + "step": 12300 + }, + { + "epoch": 2.09, + "grad_norm": 10.514162836792714, + "learning_rate": 2.540654229632955e-06, + "loss": 0.3721, + "step": 12305 + }, + { + "epoch": 2.09, + "grad_norm": 6.014041099796996, + "learning_rate": 2.536349521032846e-06, + "loss": 0.3486, + "step": 12310 + }, + { + "epoch": 2.09, + "grad_norm": 11.404746919512409, + "learning_rate": 2.5320472226884506e-06, + "loss": 0.3553, + "step": 12315 + }, + { + "epoch": 2.09, + "grad_norm": 9.926602435491768, + "learning_rate": 2.527747338808822e-06, + "loss": 0.3705, + "step": 12320 + }, + { + "epoch": 2.1, + "grad_norm": 7.43243287500068, + "learning_rate": 2.5234498736006563e-06, + "loss": 0.3758, + "step": 12325 + }, + { + "epoch": 2.1, + "grad_norm": 4.5893380700500135, + "learning_rate": 2.5191548312682758e-06, + "loss": 0.3685, + "step": 12330 + }, + { + "epoch": 2.1, + "grad_norm": 5.739196866534467, + "learning_rate": 2.5148622160136406e-06, + "loss": 0.3644, + "step": 12335 + }, + { + "epoch": 2.1, + "grad_norm": 7.223888941673534, + "learning_rate": 2.5105720320363287e-06, + "loss": 0.3566, + "step": 12340 + }, + { + "epoch": 2.1, + "grad_norm": 4.760901049999837, + "learning_rate": 2.5062842835335442e-06, + "loss": 0.3517, + "step": 12345 + }, + { + "epoch": 2.1, + "grad_norm": 8.717510780154683, + "learning_rate": 2.5019989747001043e-06, + "loss": 0.3576, + "step": 12350 + }, + { + "epoch": 2.1, + "grad_norm": 9.274190474502552, + "learning_rate": 2.4977161097284468e-06, + "loss": 0.3555, + "step": 12355 + }, + { + "epoch": 2.1, + "grad_norm": 5.661456821603926, + "learning_rate": 2.4934356928086124e-06, + "loss": 0.3526, + "step": 12360 + }, + { + "epoch": 2.1, + "grad_norm": 13.14206177144619, + "learning_rate": 2.489157728128245e-06, + "loss": 0.3536, + "step": 12365 + }, + { + "epoch": 2.1, + "grad_norm": 4.8359452582871425, + "learning_rate": 2.4848822198725974e-06, + "loss": 0.3545, + "step": 12370 + }, + { + "epoch": 2.1, + "grad_norm": 5.562661402570011, + "learning_rate": 2.4806091722245114e-06, + "loss": 0.3557, + "step": 12375 + }, + { + "epoch": 2.1, + "grad_norm": 7.216730692799468, + "learning_rate": 2.4763385893644283e-06, + "loss": 0.3527, + "step": 12380 + }, + { + "epoch": 2.11, + "grad_norm": 6.175993021978491, + "learning_rate": 2.4720704754703715e-06, + "loss": 0.3705, + "step": 12385 + }, + { + "epoch": 2.11, + "grad_norm": 11.558130455617105, + "learning_rate": 2.467804834717954e-06, + "loss": 0.3611, + "step": 12390 + }, + { + "epoch": 2.11, + "grad_norm": 12.155852876216986, + "learning_rate": 2.4635416712803635e-06, + "loss": 0.3584, + "step": 12395 + }, + { + "epoch": 2.11, + "grad_norm": 5.056660813979677, + "learning_rate": 2.4592809893283725e-06, + "loss": 0.3523, + "step": 12400 + }, + { + "epoch": 2.11, + "grad_norm": 12.155316694768354, + "learning_rate": 2.455022793030319e-06, + "loss": 0.351, + "step": 12405 + }, + { + "epoch": 2.11, + "grad_norm": 6.137254627367353, + "learning_rate": 2.4507670865521093e-06, + "loss": 0.3555, + "step": 12410 + }, + { + "epoch": 2.11, + "grad_norm": 11.335378484644814, + "learning_rate": 2.446513874057219e-06, + "loss": 0.3475, + "step": 12415 + }, + { + "epoch": 2.11, + "grad_norm": 7.202345783395633, + "learning_rate": 2.442263159706678e-06, + "loss": 0.3627, + "step": 12420 + }, + { + "epoch": 2.11, + "grad_norm": 5.040729939592325, + "learning_rate": 2.4380149476590805e-06, + "loss": 0.362, + "step": 12425 + }, + { + "epoch": 2.11, + "grad_norm": 14.663934834177535, + "learning_rate": 2.4337692420705578e-06, + "loss": 0.352, + "step": 12430 + }, + { + "epoch": 2.11, + "grad_norm": 5.6804959838619205, + "learning_rate": 2.4295260470948058e-06, + "loss": 0.3551, + "step": 12435 + }, + { + "epoch": 2.11, + "grad_norm": 8.268339570358487, + "learning_rate": 2.425285366883053e-06, + "loss": 0.3601, + "step": 12440 + }, + { + "epoch": 2.12, + "grad_norm": 6.822741932012296, + "learning_rate": 2.42104720558407e-06, + "loss": 0.3484, + "step": 12445 + }, + { + "epoch": 2.12, + "grad_norm": 9.813642685372308, + "learning_rate": 2.416811567344169e-06, + "loss": 0.3679, + "step": 12450 + }, + { + "epoch": 2.12, + "grad_norm": 10.837042565036516, + "learning_rate": 2.4125784563071843e-06, + "loss": 0.3476, + "step": 12455 + }, + { + "epoch": 2.12, + "grad_norm": 4.782218813101773, + "learning_rate": 2.4083478766144863e-06, + "loss": 0.3606, + "step": 12460 + }, + { + "epoch": 2.12, + "grad_norm": 11.059837450405126, + "learning_rate": 2.4041198324049634e-06, + "loss": 0.3672, + "step": 12465 + }, + { + "epoch": 2.12, + "grad_norm": 8.250155523400553, + "learning_rate": 2.3998943278150265e-06, + "loss": 0.3653, + "step": 12470 + }, + { + "epoch": 2.12, + "grad_norm": 8.595823286695708, + "learning_rate": 2.3956713669785974e-06, + "loss": 0.3572, + "step": 12475 + }, + { + "epoch": 2.12, + "grad_norm": 9.43108384835417, + "learning_rate": 2.391450954027117e-06, + "loss": 0.3578, + "step": 12480 + }, + { + "epoch": 2.12, + "grad_norm": 4.888909316789247, + "learning_rate": 2.387233093089527e-06, + "loss": 0.3716, + "step": 12485 + }, + { + "epoch": 2.12, + "grad_norm": 4.484941394571873, + "learning_rate": 2.383017788292273e-06, + "loss": 0.3568, + "step": 12490 + }, + { + "epoch": 2.12, + "grad_norm": 4.746259770066021, + "learning_rate": 2.3788050437593042e-06, + "loss": 0.3651, + "step": 12495 + }, + { + "epoch": 2.13, + "grad_norm": 4.9129131917140505, + "learning_rate": 2.37459486361206e-06, + "loss": 0.3515, + "step": 12500 + }, + { + "epoch": 2.13, + "grad_norm": 6.071685633633804, + "learning_rate": 2.370387251969477e-06, + "loss": 0.3692, + "step": 12505 + }, + { + "epoch": 2.13, + "grad_norm": 4.804804003375496, + "learning_rate": 2.366182212947969e-06, + "loss": 0.3534, + "step": 12510 + }, + { + "epoch": 2.13, + "grad_norm": 8.700029141854401, + "learning_rate": 2.3619797506614447e-06, + "loss": 0.3575, + "step": 12515 + }, + { + "epoch": 2.13, + "grad_norm": 4.871791337356873, + "learning_rate": 2.3577798692212817e-06, + "loss": 0.3479, + "step": 12520 + }, + { + "epoch": 2.13, + "grad_norm": 9.445874880209733, + "learning_rate": 2.353582572736342e-06, + "loss": 0.3593, + "step": 12525 + }, + { + "epoch": 2.13, + "grad_norm": 6.388800020380725, + "learning_rate": 2.349387865312951e-06, + "loss": 0.3607, + "step": 12530 + }, + { + "epoch": 2.13, + "grad_norm": 7.061201502997077, + "learning_rate": 2.3451957510549034e-06, + "loss": 0.3547, + "step": 12535 + }, + { + "epoch": 2.13, + "grad_norm": 4.40790926491953, + "learning_rate": 2.34100623406346e-06, + "loss": 0.3569, + "step": 12540 + }, + { + "epoch": 2.13, + "grad_norm": 6.112629901368969, + "learning_rate": 2.336819318437338e-06, + "loss": 0.3571, + "step": 12545 + }, + { + "epoch": 2.13, + "grad_norm": 5.5893900154448986, + "learning_rate": 2.3326350082727093e-06, + "loss": 0.3549, + "step": 12550 + }, + { + "epoch": 2.13, + "grad_norm": 5.773355334844262, + "learning_rate": 2.3284533076631954e-06, + "loss": 0.3628, + "step": 12555 + }, + { + "epoch": 2.14, + "grad_norm": 5.176848576237044, + "learning_rate": 2.3242742206998703e-06, + "loss": 0.3525, + "step": 12560 + }, + { + "epoch": 2.14, + "grad_norm": 5.064208261078508, + "learning_rate": 2.3200977514712434e-06, + "loss": 0.3646, + "step": 12565 + }, + { + "epoch": 2.14, + "grad_norm": 8.572635391436043, + "learning_rate": 2.3159239040632725e-06, + "loss": 0.3496, + "step": 12570 + }, + { + "epoch": 2.14, + "grad_norm": 6.203301780474356, + "learning_rate": 2.3117526825593417e-06, + "loss": 0.3509, + "step": 12575 + }, + { + "epoch": 2.14, + "grad_norm": 9.019304670820222, + "learning_rate": 2.307584091040268e-06, + "loss": 0.3532, + "step": 12580 + }, + { + "epoch": 2.14, + "grad_norm": 11.743250315644167, + "learning_rate": 2.303418133584301e-06, + "loss": 0.3549, + "step": 12585 + }, + { + "epoch": 2.14, + "grad_norm": 4.598745561581672, + "learning_rate": 2.299254814267107e-06, + "loss": 0.3494, + "step": 12590 + }, + { + "epoch": 2.14, + "grad_norm": 10.03454149898803, + "learning_rate": 2.295094137161774e-06, + "loss": 0.3588, + "step": 12595 + }, + { + "epoch": 2.14, + "grad_norm": 4.63513008315468, + "learning_rate": 2.2909361063388024e-06, + "loss": 0.3514, + "step": 12600 + }, + { + "epoch": 2.14, + "grad_norm": 12.429596505739113, + "learning_rate": 2.28678072586611e-06, + "loss": 0.3504, + "step": 12605 + }, + { + "epoch": 2.14, + "grad_norm": 7.747186379508115, + "learning_rate": 2.282627999809014e-06, + "loss": 0.3517, + "step": 12610 + }, + { + "epoch": 2.14, + "grad_norm": 11.766026429105853, + "learning_rate": 2.2784779322302408e-06, + "loss": 0.3515, + "step": 12615 + }, + { + "epoch": 2.15, + "grad_norm": 8.19796998489907, + "learning_rate": 2.274330527189913e-06, + "loss": 0.3563, + "step": 12620 + }, + { + "epoch": 2.15, + "grad_norm": 5.687175841710557, + "learning_rate": 2.2701857887455482e-06, + "loss": 0.3553, + "step": 12625 + }, + { + "epoch": 2.15, + "grad_norm": 5.492044808621296, + "learning_rate": 2.266043720952054e-06, + "loss": 0.346, + "step": 12630 + }, + { + "epoch": 2.15, + "grad_norm": 8.698079597137065, + "learning_rate": 2.2619043278617307e-06, + "loss": 0.355, + "step": 12635 + }, + { + "epoch": 2.15, + "grad_norm": 6.647526337896694, + "learning_rate": 2.2577676135242566e-06, + "loss": 0.356, + "step": 12640 + }, + { + "epoch": 2.15, + "grad_norm": 6.7027116634400175, + "learning_rate": 2.253633581986689e-06, + "loss": 0.3624, + "step": 12645 + }, + { + "epoch": 2.15, + "grad_norm": 5.394766540445826, + "learning_rate": 2.249502237293466e-06, + "loss": 0.3621, + "step": 12650 + }, + { + "epoch": 2.15, + "grad_norm": 5.1860900535938, + "learning_rate": 2.2453735834863897e-06, + "loss": 0.3479, + "step": 12655 + }, + { + "epoch": 2.15, + "grad_norm": 10.678739729089573, + "learning_rate": 2.2412476246046377e-06, + "loss": 0.3508, + "step": 12660 + }, + { + "epoch": 2.15, + "grad_norm": 7.542513089344192, + "learning_rate": 2.2371243646847444e-06, + "loss": 0.3579, + "step": 12665 + }, + { + "epoch": 2.15, + "grad_norm": 5.907712371792205, + "learning_rate": 2.233003807760607e-06, + "loss": 0.3553, + "step": 12670 + }, + { + "epoch": 2.15, + "grad_norm": 7.918275784958399, + "learning_rate": 2.228885957863477e-06, + "loss": 0.3488, + "step": 12675 + }, + { + "epoch": 2.16, + "grad_norm": 5.0345251496741605, + "learning_rate": 2.224770819021956e-06, + "loss": 0.3516, + "step": 12680 + }, + { + "epoch": 2.16, + "grad_norm": 6.360579403001685, + "learning_rate": 2.220658395261999e-06, + "loss": 0.3505, + "step": 12685 + }, + { + "epoch": 2.16, + "grad_norm": 5.071208825906468, + "learning_rate": 2.216548690606898e-06, + "loss": 0.3503, + "step": 12690 + }, + { + "epoch": 2.16, + "grad_norm": 7.3340027233657255, + "learning_rate": 2.2124417090772903e-06, + "loss": 0.3498, + "step": 12695 + }, + { + "epoch": 2.16, + "grad_norm": 5.058112027557419, + "learning_rate": 2.2083374546911444e-06, + "loss": 0.364, + "step": 12700 + }, + { + "epoch": 2.16, + "grad_norm": 4.4348235788957675, + "learning_rate": 2.2042359314637683e-06, + "loss": 0.3618, + "step": 12705 + }, + { + "epoch": 2.16, + "grad_norm": 9.226367642118475, + "learning_rate": 2.200137143407785e-06, + "loss": 0.3644, + "step": 12710 + }, + { + "epoch": 2.16, + "grad_norm": 4.612977427099218, + "learning_rate": 2.196041094533155e-06, + "loss": 0.3588, + "step": 12715 + }, + { + "epoch": 2.16, + "grad_norm": 6.7396878974458385, + "learning_rate": 2.1919477888471522e-06, + "loss": 0.3558, + "step": 12720 + }, + { + "epoch": 2.16, + "grad_norm": 4.955526064834778, + "learning_rate": 2.1878572303543655e-06, + "loss": 0.3601, + "step": 12725 + }, + { + "epoch": 2.16, + "grad_norm": 14.447660951252212, + "learning_rate": 2.183769423056702e-06, + "loss": 0.3562, + "step": 12730 + }, + { + "epoch": 2.17, + "grad_norm": 7.579752616290158, + "learning_rate": 2.179684370953371e-06, + "loss": 0.3577, + "step": 12735 + }, + { + "epoch": 2.17, + "grad_norm": 5.6468210027675445, + "learning_rate": 2.175602078040892e-06, + "loss": 0.352, + "step": 12740 + }, + { + "epoch": 2.17, + "grad_norm": 4.578563303804438, + "learning_rate": 2.1715225483130815e-06, + "loss": 0.3456, + "step": 12745 + }, + { + "epoch": 2.17, + "grad_norm": 5.508536921531665, + "learning_rate": 2.167445785761052e-06, + "loss": 0.358, + "step": 12750 + }, + { + "epoch": 2.17, + "grad_norm": 4.501753862781778, + "learning_rate": 2.1633717943732098e-06, + "loss": 0.3413, + "step": 12755 + }, + { + "epoch": 2.17, + "grad_norm": 6.196928032996506, + "learning_rate": 2.159300578135254e-06, + "loss": 0.3581, + "step": 12760 + }, + { + "epoch": 2.17, + "grad_norm": 6.074066358859239, + "learning_rate": 2.1552321410301626e-06, + "loss": 0.3529, + "step": 12765 + }, + { + "epoch": 2.17, + "grad_norm": 8.809753435779445, + "learning_rate": 2.1511664870381956e-06, + "loss": 0.3591, + "step": 12770 + }, + { + "epoch": 2.17, + "grad_norm": 4.725861909837628, + "learning_rate": 2.1471036201368968e-06, + "loss": 0.352, + "step": 12775 + }, + { + "epoch": 2.17, + "grad_norm": 5.314575776197405, + "learning_rate": 2.1430435443010733e-06, + "loss": 0.3479, + "step": 12780 + }, + { + "epoch": 2.17, + "grad_norm": 4.603148181328639, + "learning_rate": 2.1389862635028136e-06, + "loss": 0.352, + "step": 12785 + }, + { + "epoch": 2.17, + "grad_norm": 6.357563224881423, + "learning_rate": 2.134931781711457e-06, + "loss": 0.3524, + "step": 12790 + }, + { + "epoch": 2.18, + "grad_norm": 5.679609149108052, + "learning_rate": 2.130880102893618e-06, + "loss": 0.3547, + "step": 12795 + }, + { + "epoch": 2.18, + "grad_norm": 4.653289583363171, + "learning_rate": 2.126831231013159e-06, + "loss": 0.3433, + "step": 12800 + }, + { + "epoch": 2.18, + "grad_norm": 5.150465354689677, + "learning_rate": 2.122785170031205e-06, + "loss": 0.3553, + "step": 12805 + }, + { + "epoch": 2.18, + "grad_norm": 6.49152039269724, + "learning_rate": 2.118741923906125e-06, + "loss": 0.3545, + "step": 12810 + }, + { + "epoch": 2.18, + "grad_norm": 4.968638433195927, + "learning_rate": 2.1147014965935327e-06, + "loss": 0.3511, + "step": 12815 + }, + { + "epoch": 2.18, + "grad_norm": 15.476562767724293, + "learning_rate": 2.110663892046292e-06, + "loss": 0.3475, + "step": 12820 + }, + { + "epoch": 2.18, + "grad_norm": 5.193980384967873, + "learning_rate": 2.1066291142144978e-06, + "loss": 0.3561, + "step": 12825 + }, + { + "epoch": 2.18, + "grad_norm": 20.28983778856591, + "learning_rate": 2.1025971670454827e-06, + "loss": 0.3646, + "step": 12830 + }, + { + "epoch": 2.18, + "grad_norm": 4.287610844059453, + "learning_rate": 2.098568054483807e-06, + "loss": 0.3655, + "step": 12835 + }, + { + "epoch": 2.18, + "grad_norm": 21.597390203424133, + "learning_rate": 2.094541780471264e-06, + "loss": 0.3608, + "step": 12840 + }, + { + "epoch": 2.18, + "grad_norm": 13.785556614757647, + "learning_rate": 2.0905183489468623e-06, + "loss": 0.3641, + "step": 12845 + }, + { + "epoch": 2.18, + "grad_norm": 7.368036043761198, + "learning_rate": 2.0864977638468376e-06, + "loss": 0.3596, + "step": 12850 + }, + { + "epoch": 2.19, + "grad_norm": 5.858919450916035, + "learning_rate": 2.0824800291046347e-06, + "loss": 0.371, + "step": 12855 + }, + { + "epoch": 2.19, + "grad_norm": 13.328796201165607, + "learning_rate": 2.07846514865091e-06, + "loss": 0.3434, + "step": 12860 + }, + { + "epoch": 2.19, + "grad_norm": 4.384829402670883, + "learning_rate": 2.0744531264135327e-06, + "loss": 0.3554, + "step": 12865 + }, + { + "epoch": 2.19, + "grad_norm": 12.8962393661322, + "learning_rate": 2.0704439663175714e-06, + "loss": 0.3517, + "step": 12870 + }, + { + "epoch": 2.19, + "grad_norm": 4.3887445041630295, + "learning_rate": 2.0664376722852948e-06, + "loss": 0.3555, + "step": 12875 + }, + { + "epoch": 2.19, + "grad_norm": 19.531962902066855, + "learning_rate": 2.0624342482361664e-06, + "loss": 0.3501, + "step": 12880 + }, + { + "epoch": 2.19, + "grad_norm": 5.102304932018561, + "learning_rate": 2.058433698086848e-06, + "loss": 0.3382, + "step": 12885 + }, + { + "epoch": 2.19, + "grad_norm": 14.280312009210922, + "learning_rate": 2.0544360257511826e-06, + "loss": 0.3578, + "step": 12890 + }, + { + "epoch": 2.19, + "grad_norm": 5.072495666599368, + "learning_rate": 2.050441235140203e-06, + "loss": 0.3507, + "step": 12895 + }, + { + "epoch": 2.19, + "grad_norm": 4.7319798260058095, + "learning_rate": 2.046449330162121e-06, + "loss": 0.3427, + "step": 12900 + }, + { + "epoch": 2.19, + "grad_norm": 4.997556168618728, + "learning_rate": 2.0424603147223228e-06, + "loss": 0.3487, + "step": 12905 + }, + { + "epoch": 2.19, + "grad_norm": 4.668830722747984, + "learning_rate": 2.0384741927233687e-06, + "loss": 0.3471, + "step": 12910 + }, + { + "epoch": 2.2, + "grad_norm": 4.498365158807844, + "learning_rate": 2.0344909680649937e-06, + "loss": 0.3444, + "step": 12915 + }, + { + "epoch": 2.2, + "grad_norm": 4.035491184131442, + "learning_rate": 2.030510644644091e-06, + "loss": 0.3531, + "step": 12920 + }, + { + "epoch": 2.2, + "grad_norm": 4.3573911283481035, + "learning_rate": 2.0265332263547175e-06, + "loss": 0.3546, + "step": 12925 + }, + { + "epoch": 2.2, + "grad_norm": 12.465513201544523, + "learning_rate": 2.022558717088092e-06, + "loss": 0.351, + "step": 12930 + }, + { + "epoch": 2.2, + "grad_norm": 4.656423916603554, + "learning_rate": 2.01858712073258e-06, + "loss": 0.3478, + "step": 12935 + }, + { + "epoch": 2.2, + "grad_norm": 5.118832543339823, + "learning_rate": 2.0146184411737057e-06, + "loss": 0.3521, + "step": 12940 + }, + { + "epoch": 2.2, + "grad_norm": 5.423815170561722, + "learning_rate": 2.0106526822941336e-06, + "loss": 0.3537, + "step": 12945 + }, + { + "epoch": 2.2, + "grad_norm": 4.497609982111329, + "learning_rate": 2.006689847973672e-06, + "loss": 0.362, + "step": 12950 + }, + { + "epoch": 2.2, + "grad_norm": 8.724143543231287, + "learning_rate": 2.0027299420892687e-06, + "loss": 0.3605, + "step": 12955 + }, + { + "epoch": 2.2, + "grad_norm": 6.545635461939133, + "learning_rate": 1.9987729685150054e-06, + "loss": 0.3477, + "step": 12960 + }, + { + "epoch": 2.2, + "grad_norm": 8.329174384014586, + "learning_rate": 1.994818931122099e-06, + "loss": 0.353, + "step": 12965 + }, + { + "epoch": 2.21, + "grad_norm": 5.631646120301006, + "learning_rate": 1.9908678337788866e-06, + "loss": 0.3623, + "step": 12970 + }, + { + "epoch": 2.21, + "grad_norm": 4.587745944032032, + "learning_rate": 1.9869196803508383e-06, + "loss": 0.3512, + "step": 12975 + }, + { + "epoch": 2.21, + "grad_norm": 4.614550001666569, + "learning_rate": 1.9829744747005355e-06, + "loss": 0.3486, + "step": 12980 + }, + { + "epoch": 2.21, + "grad_norm": 6.188568299003901, + "learning_rate": 1.979032220687683e-06, + "loss": 0.3576, + "step": 12985 + }, + { + "epoch": 2.21, + "grad_norm": 5.165804827286981, + "learning_rate": 1.975092922169089e-06, + "loss": 0.3532, + "step": 12990 + }, + { + "epoch": 2.21, + "grad_norm": 5.097043978976538, + "learning_rate": 1.9711565829986795e-06, + "loss": 0.3489, + "step": 12995 + }, + { + "epoch": 2.21, + "grad_norm": 4.939939440373387, + "learning_rate": 1.9672232070274803e-06, + "loss": 0.3582, + "step": 13000 + }, + { + "epoch": 2.21, + "grad_norm": 5.601542152733169, + "learning_rate": 1.963292798103617e-06, + "loss": 0.3549, + "step": 13005 + }, + { + "epoch": 2.21, + "grad_norm": 10.330609981982413, + "learning_rate": 1.9593653600723184e-06, + "loss": 0.3459, + "step": 13010 + }, + { + "epoch": 2.21, + "grad_norm": 5.109084999537791, + "learning_rate": 1.9554408967758996e-06, + "loss": 0.3554, + "step": 13015 + }, + { + "epoch": 2.21, + "grad_norm": 5.586639449236359, + "learning_rate": 1.951519412053772e-06, + "loss": 0.3496, + "step": 13020 + }, + { + "epoch": 2.21, + "grad_norm": 8.917383245650884, + "learning_rate": 1.947600909742427e-06, + "loss": 0.3497, + "step": 13025 + }, + { + "epoch": 2.22, + "grad_norm": 5.944441252305898, + "learning_rate": 1.9436853936754456e-06, + "loss": 0.3571, + "step": 13030 + }, + { + "epoch": 2.22, + "grad_norm": 15.326786665243585, + "learning_rate": 1.9397728676834772e-06, + "loss": 0.3525, + "step": 13035 + }, + { + "epoch": 2.22, + "grad_norm": 5.722319121428556, + "learning_rate": 1.9358633355942547e-06, + "loss": 0.3523, + "step": 13040 + }, + { + "epoch": 2.22, + "grad_norm": 4.987853647705775, + "learning_rate": 1.9319568012325785e-06, + "loss": 0.346, + "step": 13045 + }, + { + "epoch": 2.22, + "grad_norm": 4.821900932918403, + "learning_rate": 1.928053268420314e-06, + "loss": 0.3535, + "step": 13050 + }, + { + "epoch": 2.22, + "grad_norm": 5.759054880221154, + "learning_rate": 1.924152740976397e-06, + "loss": 0.3507, + "step": 13055 + }, + { + "epoch": 2.22, + "grad_norm": 4.184172045514366, + "learning_rate": 1.920255222716815e-06, + "loss": 0.3547, + "step": 13060 + }, + { + "epoch": 2.22, + "grad_norm": 8.550200708516813, + "learning_rate": 1.916360717454618e-06, + "loss": 0.3586, + "step": 13065 + }, + { + "epoch": 2.22, + "grad_norm": 7.201643974584935, + "learning_rate": 1.9124692289999043e-06, + "loss": 0.3534, + "step": 13070 + }, + { + "epoch": 2.22, + "grad_norm": 8.145930153650735, + "learning_rate": 1.908580761159822e-06, + "loss": 0.3493, + "step": 13075 + }, + { + "epoch": 2.22, + "grad_norm": 6.265935915615525, + "learning_rate": 1.9046953177385623e-06, + "loss": 0.3474, + "step": 13080 + }, + { + "epoch": 2.22, + "grad_norm": 5.750015111410342, + "learning_rate": 1.9008129025373629e-06, + "loss": 0.3522, + "step": 13085 + }, + { + "epoch": 2.23, + "grad_norm": 4.615266133527854, + "learning_rate": 1.896933519354493e-06, + "loss": 0.3465, + "step": 13090 + }, + { + "epoch": 2.23, + "grad_norm": 5.064289801459999, + "learning_rate": 1.893057171985257e-06, + "loss": 0.3433, + "step": 13095 + }, + { + "epoch": 2.23, + "grad_norm": 4.437705557970718, + "learning_rate": 1.889183864221993e-06, + "loss": 0.3432, + "step": 13100 + }, + { + "epoch": 2.23, + "grad_norm": 4.890599420576279, + "learning_rate": 1.885313599854059e-06, + "loss": 0.3489, + "step": 13105 + }, + { + "epoch": 2.23, + "grad_norm": 5.3124975455109755, + "learning_rate": 1.8814463826678442e-06, + "loss": 0.3511, + "step": 13110 + }, + { + "epoch": 2.23, + "grad_norm": 9.871965635586927, + "learning_rate": 1.877582216446745e-06, + "loss": 0.3502, + "step": 13115 + }, + { + "epoch": 2.23, + "grad_norm": 4.646315847922393, + "learning_rate": 1.873721104971184e-06, + "loss": 0.3505, + "step": 13120 + }, + { + "epoch": 2.23, + "grad_norm": 6.027378772777941, + "learning_rate": 1.8698630520185874e-06, + "loss": 0.3479, + "step": 13125 + }, + { + "epoch": 2.23, + "grad_norm": 5.038666599099896, + "learning_rate": 1.8660080613633963e-06, + "loss": 0.347, + "step": 13130 + }, + { + "epoch": 2.23, + "grad_norm": 5.010673079253129, + "learning_rate": 1.8621561367770497e-06, + "loss": 0.3529, + "step": 13135 + }, + { + "epoch": 2.23, + "grad_norm": 8.602362535722602, + "learning_rate": 1.8583072820279885e-06, + "loss": 0.3444, + "step": 13140 + }, + { + "epoch": 2.23, + "grad_norm": 4.896038325823834, + "learning_rate": 1.8544615008816536e-06, + "loss": 0.3442, + "step": 13145 + }, + { + "epoch": 2.24, + "grad_norm": 5.876220617037956, + "learning_rate": 1.8506187971004753e-06, + "loss": 0.3434, + "step": 13150 + }, + { + "epoch": 2.24, + "grad_norm": 5.195201460399498, + "learning_rate": 1.8467791744438735e-06, + "loss": 0.3522, + "step": 13155 + }, + { + "epoch": 2.24, + "grad_norm": 5.816129760139536, + "learning_rate": 1.842942636668254e-06, + "loss": 0.3514, + "step": 13160 + }, + { + "epoch": 2.24, + "grad_norm": 4.810474650882271, + "learning_rate": 1.8391091875270083e-06, + "loss": 0.3541, + "step": 13165 + }, + { + "epoch": 2.24, + "grad_norm": 5.278313176164161, + "learning_rate": 1.8352788307704994e-06, + "loss": 0.3551, + "step": 13170 + }, + { + "epoch": 2.24, + "grad_norm": 5.719003770806738, + "learning_rate": 1.8314515701460728e-06, + "loss": 0.3526, + "step": 13175 + }, + { + "epoch": 2.24, + "grad_norm": 4.4429678197703275, + "learning_rate": 1.8276274093980378e-06, + "loss": 0.3465, + "step": 13180 + }, + { + "epoch": 2.24, + "grad_norm": 8.14256452509486, + "learning_rate": 1.823806352267673e-06, + "loss": 0.3431, + "step": 13185 + }, + { + "epoch": 2.24, + "grad_norm": 4.746051066675634, + "learning_rate": 1.8199884024932269e-06, + "loss": 0.3499, + "step": 13190 + }, + { + "epoch": 2.24, + "grad_norm": 7.986311949760354, + "learning_rate": 1.8161735638098954e-06, + "loss": 0.3448, + "step": 13195 + }, + { + "epoch": 2.24, + "grad_norm": 6.914570318914096, + "learning_rate": 1.8123618399498443e-06, + "loss": 0.3507, + "step": 13200 + }, + { + "epoch": 2.24, + "grad_norm": 7.1579141005865, + "learning_rate": 1.8085532346421813e-06, + "loss": 0.3493, + "step": 13205 + }, + { + "epoch": 2.25, + "grad_norm": 6.884851366646707, + "learning_rate": 1.8047477516129714e-06, + "loss": 0.3462, + "step": 13210 + }, + { + "epoch": 2.25, + "grad_norm": 4.922576484520524, + "learning_rate": 1.800945394585218e-06, + "loss": 0.3484, + "step": 13215 + }, + { + "epoch": 2.25, + "grad_norm": 5.022729298673702, + "learning_rate": 1.797146167278873e-06, + "loss": 0.3559, + "step": 13220 + }, + { + "epoch": 2.25, + "grad_norm": 10.15404561937025, + "learning_rate": 1.7933500734108217e-06, + "loss": 0.3461, + "step": 13225 + }, + { + "epoch": 2.25, + "grad_norm": 4.677057476274528, + "learning_rate": 1.7895571166948839e-06, + "loss": 0.3531, + "step": 13230 + }, + { + "epoch": 2.25, + "grad_norm": 16.877583132023236, + "learning_rate": 1.7857673008418126e-06, + "loss": 0.3499, + "step": 13235 + }, + { + "epoch": 2.25, + "grad_norm": 5.918368999806285, + "learning_rate": 1.7819806295592846e-06, + "loss": 0.3479, + "step": 13240 + }, + { + "epoch": 2.25, + "grad_norm": 13.074015583450297, + "learning_rate": 1.778197106551906e-06, + "loss": 0.3469, + "step": 13245 + }, + { + "epoch": 2.25, + "grad_norm": 5.204338553008846, + "learning_rate": 1.7744167355211967e-06, + "loss": 0.3437, + "step": 13250 + }, + { + "epoch": 2.25, + "grad_norm": 11.941548788313813, + "learning_rate": 1.770639520165598e-06, + "loss": 0.3347, + "step": 13255 + }, + { + "epoch": 2.25, + "grad_norm": 4.311730600707324, + "learning_rate": 1.7668654641804583e-06, + "loss": 0.3517, + "step": 13260 + }, + { + "epoch": 2.26, + "grad_norm": 6.954675892706177, + "learning_rate": 1.7630945712580427e-06, + "loss": 0.3657, + "step": 13265 + }, + { + "epoch": 2.26, + "grad_norm": 4.517930593178529, + "learning_rate": 1.7593268450875145e-06, + "loss": 0.3505, + "step": 13270 + }, + { + "epoch": 2.26, + "grad_norm": 10.857689798245556, + "learning_rate": 1.7555622893549429e-06, + "loss": 0.3502, + "step": 13275 + }, + { + "epoch": 2.26, + "grad_norm": 6.640722053004046, + "learning_rate": 1.751800907743294e-06, + "loss": 0.3531, + "step": 13280 + }, + { + "epoch": 2.26, + "grad_norm": 7.540364031508203, + "learning_rate": 1.7480427039324266e-06, + "loss": 0.3543, + "step": 13285 + }, + { + "epoch": 2.26, + "grad_norm": 6.191867604546337, + "learning_rate": 1.7442876815990972e-06, + "loss": 0.3579, + "step": 13290 + }, + { + "epoch": 2.26, + "grad_norm": 10.730074278228338, + "learning_rate": 1.7405358444169413e-06, + "loss": 0.3585, + "step": 13295 + }, + { + "epoch": 2.26, + "grad_norm": 6.655593162182091, + "learning_rate": 1.7367871960564865e-06, + "loss": 0.3635, + "step": 13300 + }, + { + "epoch": 2.26, + "grad_norm": 4.232812680915814, + "learning_rate": 1.7330417401851317e-06, + "loss": 0.3463, + "step": 13305 + }, + { + "epoch": 2.26, + "grad_norm": 4.764333836098204, + "learning_rate": 1.7292994804671648e-06, + "loss": 0.3489, + "step": 13310 + }, + { + "epoch": 2.26, + "grad_norm": 4.560495803368249, + "learning_rate": 1.7255604205637305e-06, + "loss": 0.3567, + "step": 13315 + }, + { + "epoch": 2.26, + "grad_norm": 6.599711992803756, + "learning_rate": 1.7218245641328585e-06, + "loss": 0.3439, + "step": 13320 + }, + { + "epoch": 2.27, + "grad_norm": 6.5763943704902665, + "learning_rate": 1.7180919148294356e-06, + "loss": 0.3478, + "step": 13325 + }, + { + "epoch": 2.27, + "grad_norm": 6.192559204919637, + "learning_rate": 1.7143624763052113e-06, + "loss": 0.3555, + "step": 13330 + }, + { + "epoch": 2.27, + "grad_norm": 6.530792324752998, + "learning_rate": 1.7106362522088e-06, + "loss": 0.3484, + "step": 13335 + }, + { + "epoch": 2.27, + "grad_norm": 5.030676755391766, + "learning_rate": 1.7069132461856636e-06, + "loss": 0.3502, + "step": 13340 + }, + { + "epoch": 2.27, + "grad_norm": 5.3288817707798675, + "learning_rate": 1.703193461878122e-06, + "loss": 0.3426, + "step": 13345 + }, + { + "epoch": 2.27, + "grad_norm": 6.86473762505183, + "learning_rate": 1.69947690292534e-06, + "loss": 0.3425, + "step": 13350 + }, + { + "epoch": 2.27, + "grad_norm": 6.179694085709815, + "learning_rate": 1.6957635729633265e-06, + "loss": 0.3411, + "step": 13355 + }, + { + "epoch": 2.27, + "grad_norm": 4.590706960514583, + "learning_rate": 1.6920534756249313e-06, + "loss": 0.3386, + "step": 13360 + }, + { + "epoch": 2.27, + "grad_norm": 6.085792274530593, + "learning_rate": 1.6883466145398458e-06, + "loss": 0.3517, + "step": 13365 + }, + { + "epoch": 2.27, + "grad_norm": 9.457605435397486, + "learning_rate": 1.6846429933345909e-06, + "loss": 0.3608, + "step": 13370 + }, + { + "epoch": 2.27, + "grad_norm": 4.232165753807701, + "learning_rate": 1.6809426156325165e-06, + "loss": 0.3487, + "step": 13375 + }, + { + "epoch": 2.27, + "grad_norm": 5.420720267383381, + "learning_rate": 1.6772454850538062e-06, + "loss": 0.3393, + "step": 13380 + }, + { + "epoch": 2.28, + "grad_norm": 7.003590199085304, + "learning_rate": 1.6735516052154581e-06, + "loss": 0.3444, + "step": 13385 + }, + { + "epoch": 2.28, + "grad_norm": 4.89964111274835, + "learning_rate": 1.6698609797313015e-06, + "loss": 0.3614, + "step": 13390 + }, + { + "epoch": 2.28, + "grad_norm": 4.51153800199699, + "learning_rate": 1.666173612211966e-06, + "loss": 0.3566, + "step": 13395 + }, + { + "epoch": 2.28, + "grad_norm": 4.191882587472384, + "learning_rate": 1.6624895062649087e-06, + "loss": 0.3313, + "step": 13400 + }, + { + "epoch": 2.28, + "grad_norm": 5.991928299959772, + "learning_rate": 1.658808665494387e-06, + "loss": 0.3519, + "step": 13405 + }, + { + "epoch": 2.28, + "grad_norm": 4.8209245667833525, + "learning_rate": 1.6551310935014686e-06, + "loss": 0.3414, + "step": 13410 + }, + { + "epoch": 2.28, + "grad_norm": 8.351080819575392, + "learning_rate": 1.6514567938840215e-06, + "loss": 0.3518, + "step": 13415 + }, + { + "epoch": 2.28, + "grad_norm": 5.231585925437265, + "learning_rate": 1.6477857702367088e-06, + "loss": 0.3517, + "step": 13420 + }, + { + "epoch": 2.28, + "grad_norm": 6.649497456792598, + "learning_rate": 1.644118026150996e-06, + "loss": 0.3473, + "step": 13425 + }, + { + "epoch": 2.28, + "grad_norm": 6.598271100066865, + "learning_rate": 1.640453565215135e-06, + "loss": 0.3471, + "step": 13430 + }, + { + "epoch": 2.28, + "grad_norm": 4.560216547673767, + "learning_rate": 1.636792391014166e-06, + "loss": 0.3402, + "step": 13435 + }, + { + "epoch": 2.28, + "grad_norm": 4.525873620362791, + "learning_rate": 1.6331345071299126e-06, + "loss": 0.3444, + "step": 13440 + }, + { + "epoch": 2.29, + "grad_norm": 4.65064534196253, + "learning_rate": 1.6294799171409847e-06, + "loss": 0.3358, + "step": 13445 + }, + { + "epoch": 2.29, + "grad_norm": 4.884507828410077, + "learning_rate": 1.6258286246227639e-06, + "loss": 0.3444, + "step": 13450 + }, + { + "epoch": 2.29, + "grad_norm": 5.0941968873682155, + "learning_rate": 1.6221806331474105e-06, + "loss": 0.3451, + "step": 13455 + }, + { + "epoch": 2.29, + "grad_norm": 5.836567699217456, + "learning_rate": 1.6185359462838517e-06, + "loss": 0.342, + "step": 13460 + }, + { + "epoch": 2.29, + "grad_norm": 6.717508618110533, + "learning_rate": 1.614894567597781e-06, + "loss": 0.3408, + "step": 13465 + }, + { + "epoch": 2.29, + "grad_norm": 4.417072529234567, + "learning_rate": 1.6112565006516628e-06, + "loss": 0.348, + "step": 13470 + }, + { + "epoch": 2.29, + "grad_norm": 8.232586277623032, + "learning_rate": 1.6076217490047092e-06, + "loss": 0.3471, + "step": 13475 + }, + { + "epoch": 2.29, + "grad_norm": 12.738691740688402, + "learning_rate": 1.6039903162129005e-06, + "loss": 0.346, + "step": 13480 + }, + { + "epoch": 2.29, + "grad_norm": 4.476783163143757, + "learning_rate": 1.6003622058289625e-06, + "loss": 0.339, + "step": 13485 + }, + { + "epoch": 2.29, + "grad_norm": 6.1786516010413735, + "learning_rate": 1.5967374214023767e-06, + "loss": 0.3426, + "step": 13490 + }, + { + "epoch": 2.29, + "grad_norm": 9.240119993457492, + "learning_rate": 1.5931159664793638e-06, + "loss": 0.3576, + "step": 13495 + }, + { + "epoch": 2.3, + "grad_norm": 4.617265515344786, + "learning_rate": 1.5894978446028948e-06, + "loss": 0.3571, + "step": 13500 + }, + { + "epoch": 2.3, + "grad_norm": 8.100899996233863, + "learning_rate": 1.5858830593126733e-06, + "loss": 0.3476, + "step": 13505 + }, + { + "epoch": 2.3, + "grad_norm": 5.180130314731739, + "learning_rate": 1.582271614145142e-06, + "loss": 0.3479, + "step": 13510 + }, + { + "epoch": 2.3, + "grad_norm": 4.490433584866408, + "learning_rate": 1.5786635126334748e-06, + "loss": 0.3493, + "step": 13515 + }, + { + "epoch": 2.3, + "grad_norm": 4.815477303917275, + "learning_rate": 1.5750587583075732e-06, + "loss": 0.3326, + "step": 13520 + }, + { + "epoch": 2.3, + "grad_norm": 4.583652687710298, + "learning_rate": 1.5714573546940692e-06, + "loss": 0.3457, + "step": 13525 + }, + { + "epoch": 2.3, + "grad_norm": 4.454232968768451, + "learning_rate": 1.5678593053163093e-06, + "loss": 0.3405, + "step": 13530 + }, + { + "epoch": 2.3, + "grad_norm": 8.43772017874306, + "learning_rate": 1.5642646136943657e-06, + "loss": 0.3352, + "step": 13535 + }, + { + "epoch": 2.3, + "grad_norm": 7.122617314101993, + "learning_rate": 1.5606732833450189e-06, + "loss": 0.3398, + "step": 13540 + }, + { + "epoch": 2.3, + "grad_norm": 4.665676662010156, + "learning_rate": 1.5570853177817675e-06, + "loss": 0.3396, + "step": 13545 + }, + { + "epoch": 2.3, + "grad_norm": 4.511840143186782, + "learning_rate": 1.5535007205148134e-06, + "loss": 0.3554, + "step": 13550 + }, + { + "epoch": 2.3, + "grad_norm": 4.798262124458812, + "learning_rate": 1.549919495051065e-06, + "loss": 0.3454, + "step": 13555 + }, + { + "epoch": 2.31, + "grad_norm": 9.802067833224381, + "learning_rate": 1.546341644894131e-06, + "loss": 0.3409, + "step": 13560 + }, + { + "epoch": 2.31, + "grad_norm": 6.21854918317779, + "learning_rate": 1.5427671735443179e-06, + "loss": 0.3396, + "step": 13565 + }, + { + "epoch": 2.31, + "grad_norm": 9.027154433511654, + "learning_rate": 1.5391960844986303e-06, + "loss": 0.3347, + "step": 13570 + }, + { + "epoch": 2.31, + "grad_norm": 5.9085323414991215, + "learning_rate": 1.5356283812507583e-06, + "loss": 0.3556, + "step": 13575 + }, + { + "epoch": 2.31, + "grad_norm": 5.97029483532059, + "learning_rate": 1.5320640672910847e-06, + "loss": 0.3553, + "step": 13580 + }, + { + "epoch": 2.31, + "grad_norm": 4.560186552875563, + "learning_rate": 1.5285031461066707e-06, + "loss": 0.3388, + "step": 13585 + }, + { + "epoch": 2.31, + "grad_norm": 4.753874546064374, + "learning_rate": 1.524945621181267e-06, + "loss": 0.3467, + "step": 13590 + }, + { + "epoch": 2.31, + "grad_norm": 5.343057377128513, + "learning_rate": 1.521391495995289e-06, + "loss": 0.3394, + "step": 13595 + }, + { + "epoch": 2.31, + "grad_norm": 4.353250063424364, + "learning_rate": 1.517840774025839e-06, + "loss": 0.3498, + "step": 13600 + }, + { + "epoch": 2.31, + "grad_norm": 7.217358273512686, + "learning_rate": 1.5142934587466818e-06, + "loss": 0.3369, + "step": 13605 + }, + { + "epoch": 2.31, + "grad_norm": 5.135377518107784, + "learning_rate": 1.5107495536282501e-06, + "loss": 0.3443, + "step": 13610 + }, + { + "epoch": 2.31, + "grad_norm": 4.732260437297867, + "learning_rate": 1.507209062137645e-06, + "loss": 0.3451, + "step": 13615 + }, + { + "epoch": 2.32, + "grad_norm": 4.850580660514156, + "learning_rate": 1.5036719877386219e-06, + "loss": 0.3456, + "step": 13620 + }, + { + "epoch": 2.32, + "grad_norm": 6.058081962272013, + "learning_rate": 1.5001383338915992e-06, + "loss": 0.3438, + "step": 13625 + }, + { + "epoch": 2.32, + "grad_norm": 5.18034489260238, + "learning_rate": 1.496608104053644e-06, + "loss": 0.342, + "step": 13630 + }, + { + "epoch": 2.32, + "grad_norm": 4.738934622372447, + "learning_rate": 1.4930813016784757e-06, + "loss": 0.3525, + "step": 13635 + }, + { + "epoch": 2.32, + "grad_norm": 7.446377218869251, + "learning_rate": 1.4895579302164582e-06, + "loss": 0.344, + "step": 13640 + }, + { + "epoch": 2.32, + "grad_norm": 5.134281629105313, + "learning_rate": 1.486037993114604e-06, + "loss": 0.3424, + "step": 13645 + }, + { + "epoch": 2.32, + "grad_norm": 6.422097460058927, + "learning_rate": 1.482521493816561e-06, + "loss": 0.3414, + "step": 13650 + }, + { + "epoch": 2.32, + "grad_norm": 8.43992691411807, + "learning_rate": 1.4790084357626144e-06, + "loss": 0.3343, + "step": 13655 + }, + { + "epoch": 2.32, + "grad_norm": 6.523273549487832, + "learning_rate": 1.475498822389686e-06, + "loss": 0.3338, + "step": 13660 + }, + { + "epoch": 2.32, + "grad_norm": 5.435185890787804, + "learning_rate": 1.4719926571313225e-06, + "loss": 0.3386, + "step": 13665 + }, + { + "epoch": 2.32, + "grad_norm": 8.036234796730652, + "learning_rate": 1.4684899434177042e-06, + "loss": 0.3418, + "step": 13670 + }, + { + "epoch": 2.32, + "grad_norm": 4.985559920161983, + "learning_rate": 1.4649906846756246e-06, + "loss": 0.3483, + "step": 13675 + }, + { + "epoch": 2.33, + "grad_norm": 9.198127442793323, + "learning_rate": 1.4614948843285075e-06, + "loss": 0.3383, + "step": 13680 + }, + { + "epoch": 2.33, + "grad_norm": 4.405724182538725, + "learning_rate": 1.4580025457963853e-06, + "loss": 0.3441, + "step": 13685 + }, + { + "epoch": 2.33, + "grad_norm": 9.447408149158544, + "learning_rate": 1.4545136724959103e-06, + "loss": 0.3393, + "step": 13690 + }, + { + "epoch": 2.33, + "grad_norm": 4.682546730853727, + "learning_rate": 1.4510282678403398e-06, + "loss": 0.3432, + "step": 13695 + }, + { + "epoch": 2.33, + "grad_norm": 5.311866051573056, + "learning_rate": 1.4475463352395375e-06, + "loss": 0.3409, + "step": 13700 + }, + { + "epoch": 2.33, + "grad_norm": 4.960718718240998, + "learning_rate": 1.4440678780999755e-06, + "loss": 0.3392, + "step": 13705 + }, + { + "epoch": 2.33, + "grad_norm": 5.020210423031231, + "learning_rate": 1.4405928998247198e-06, + "loss": 0.3457, + "step": 13710 + }, + { + "epoch": 2.33, + "grad_norm": 7.037044644399787, + "learning_rate": 1.4371214038134369e-06, + "loss": 0.3529, + "step": 13715 + }, + { + "epoch": 2.33, + "grad_norm": 6.904175449827646, + "learning_rate": 1.4336533934623815e-06, + "loss": 0.3381, + "step": 13720 + }, + { + "epoch": 2.33, + "grad_norm": 10.645039334534665, + "learning_rate": 1.4301888721644059e-06, + "loss": 0.3474, + "step": 13725 + }, + { + "epoch": 2.33, + "grad_norm": 4.999952519704578, + "learning_rate": 1.426727843308942e-06, + "loss": 0.3431, + "step": 13730 + }, + { + "epoch": 2.34, + "grad_norm": 5.703475806749105, + "learning_rate": 1.42327031028201e-06, + "loss": 0.3368, + "step": 13735 + }, + { + "epoch": 2.34, + "grad_norm": 7.277357316094131, + "learning_rate": 1.419816276466206e-06, + "loss": 0.338, + "step": 13740 + }, + { + "epoch": 2.34, + "grad_norm": 5.857459866364857, + "learning_rate": 1.4163657452407037e-06, + "loss": 0.3473, + "step": 13745 + }, + { + "epoch": 2.34, + "grad_norm": 5.60983307528463, + "learning_rate": 1.4129187199812539e-06, + "loss": 0.341, + "step": 13750 + }, + { + "epoch": 2.34, + "grad_norm": 5.131631743151706, + "learning_rate": 1.4094752040601722e-06, + "loss": 0.3404, + "step": 13755 + }, + { + "epoch": 2.34, + "grad_norm": 5.013967935247216, + "learning_rate": 1.4060352008463434e-06, + "loss": 0.3361, + "step": 13760 + }, + { + "epoch": 2.34, + "grad_norm": 4.727283192259992, + "learning_rate": 1.4025987137052138e-06, + "loss": 0.3404, + "step": 13765 + }, + { + "epoch": 2.34, + "grad_norm": 4.777124384197018, + "learning_rate": 1.399165745998795e-06, + "loss": 0.3486, + "step": 13770 + }, + { + "epoch": 2.34, + "grad_norm": 4.506027592709238, + "learning_rate": 1.3957363010856485e-06, + "loss": 0.3348, + "step": 13775 + }, + { + "epoch": 2.34, + "grad_norm": 5.325554245222836, + "learning_rate": 1.3923103823208956e-06, + "loss": 0.3505, + "step": 13780 + }, + { + "epoch": 2.34, + "grad_norm": 5.141132954617194, + "learning_rate": 1.3888879930562033e-06, + "loss": 0.3359, + "step": 13785 + }, + { + "epoch": 2.34, + "grad_norm": 4.567212775700831, + "learning_rate": 1.3854691366397866e-06, + "loss": 0.3457, + "step": 13790 + }, + { + "epoch": 2.35, + "grad_norm": 5.297810518315015, + "learning_rate": 1.3820538164164093e-06, + "loss": 0.3438, + "step": 13795 + }, + { + "epoch": 2.35, + "grad_norm": 9.281204002436738, + "learning_rate": 1.378642035727365e-06, + "loss": 0.3369, + "step": 13800 + }, + { + "epoch": 2.35, + "grad_norm": 4.36525808599186, + "learning_rate": 1.375233797910495e-06, + "loss": 0.3365, + "step": 13805 + }, + { + "epoch": 2.35, + "grad_norm": 4.4490501640437765, + "learning_rate": 1.3718291063001682e-06, + "loss": 0.3398, + "step": 13810 + }, + { + "epoch": 2.35, + "grad_norm": 5.715687665200049, + "learning_rate": 1.3684279642272885e-06, + "loss": 0.3455, + "step": 13815 + }, + { + "epoch": 2.35, + "grad_norm": 6.08889261493712, + "learning_rate": 1.3650303750192817e-06, + "loss": 0.344, + "step": 13820 + }, + { + "epoch": 2.35, + "grad_norm": 8.533607736470023, + "learning_rate": 1.361636342000105e-06, + "loss": 0.3381, + "step": 13825 + }, + { + "epoch": 2.35, + "grad_norm": 9.119921821708115, + "learning_rate": 1.35824586849023e-06, + "loss": 0.3522, + "step": 13830 + }, + { + "epoch": 2.35, + "grad_norm": 7.88098686226356, + "learning_rate": 1.3548589578066483e-06, + "loss": 0.3347, + "step": 13835 + }, + { + "epoch": 2.35, + "grad_norm": 8.405121362717537, + "learning_rate": 1.351475613262867e-06, + "loss": 0.343, + "step": 13840 + }, + { + "epoch": 2.35, + "grad_norm": 5.427892201945487, + "learning_rate": 1.3480958381689007e-06, + "loss": 0.3356, + "step": 13845 + }, + { + "epoch": 2.35, + "grad_norm": 5.109650437688848, + "learning_rate": 1.3447196358312785e-06, + "loss": 0.336, + "step": 13850 + }, + { + "epoch": 2.36, + "grad_norm": 6.899595498191738, + "learning_rate": 1.3413470095530267e-06, + "loss": 0.3448, + "step": 13855 + }, + { + "epoch": 2.36, + "grad_norm": 13.105610717701218, + "learning_rate": 1.3379779626336792e-06, + "loss": 0.3493, + "step": 13860 + }, + { + "epoch": 2.36, + "grad_norm": 4.477048054743193, + "learning_rate": 1.3346124983692633e-06, + "loss": 0.3459, + "step": 13865 + }, + { + "epoch": 2.36, + "grad_norm": 7.3026896755353565, + "learning_rate": 1.3312506200523056e-06, + "loss": 0.3442, + "step": 13870 + }, + { + "epoch": 2.36, + "grad_norm": 4.830324581300535, + "learning_rate": 1.3278923309718216e-06, + "loss": 0.3424, + "step": 13875 + }, + { + "epoch": 2.36, + "grad_norm": 5.589760157400933, + "learning_rate": 1.3245376344133154e-06, + "loss": 0.3367, + "step": 13880 + }, + { + "epoch": 2.36, + "grad_norm": 7.4284197300354275, + "learning_rate": 1.321186533658778e-06, + "loss": 0.345, + "step": 13885 + }, + { + "epoch": 2.36, + "grad_norm": 7.245485052632024, + "learning_rate": 1.3178390319866796e-06, + "loss": 0.343, + "step": 13890 + }, + { + "epoch": 2.36, + "grad_norm": 7.468765278758668, + "learning_rate": 1.314495132671974e-06, + "loss": 0.3358, + "step": 13895 + }, + { + "epoch": 2.36, + "grad_norm": 6.730884937980976, + "learning_rate": 1.3111548389860856e-06, + "loss": 0.3532, + "step": 13900 + }, + { + "epoch": 2.36, + "grad_norm": 6.02533076184626, + "learning_rate": 1.307818154196917e-06, + "loss": 0.3387, + "step": 13905 + }, + { + "epoch": 2.36, + "grad_norm": 5.535237415331175, + "learning_rate": 1.3044850815688336e-06, + "loss": 0.3372, + "step": 13910 + }, + { + "epoch": 2.37, + "grad_norm": 6.078079559136438, + "learning_rate": 1.3011556243626744e-06, + "loss": 0.3395, + "step": 13915 + }, + { + "epoch": 2.37, + "grad_norm": 5.862215623470651, + "learning_rate": 1.2978297858357319e-06, + "loss": 0.3286, + "step": 13920 + }, + { + "epoch": 2.37, + "grad_norm": 5.075655997097684, + "learning_rate": 1.294507569241768e-06, + "loss": 0.3476, + "step": 13925 + }, + { + "epoch": 2.37, + "grad_norm": 6.601342646915005, + "learning_rate": 1.291188977830995e-06, + "loss": 0.3412, + "step": 13930 + }, + { + "epoch": 2.37, + "grad_norm": 6.377658288362509, + "learning_rate": 1.2878740148500784e-06, + "loss": 0.3469, + "step": 13935 + }, + { + "epoch": 2.37, + "grad_norm": 10.988578510111449, + "learning_rate": 1.2845626835421405e-06, + "loss": 0.3386, + "step": 13940 + }, + { + "epoch": 2.37, + "grad_norm": 5.089807389276085, + "learning_rate": 1.2812549871467417e-06, + "loss": 0.3514, + "step": 13945 + }, + { + "epoch": 2.37, + "grad_norm": 8.255751972220425, + "learning_rate": 1.2779509288998937e-06, + "loss": 0.3441, + "step": 13950 + }, + { + "epoch": 2.37, + "grad_norm": 7.210664216579693, + "learning_rate": 1.2746505120340447e-06, + "loss": 0.3439, + "step": 13955 + }, + { + "epoch": 2.37, + "grad_norm": 8.13247241723326, + "learning_rate": 1.271353739778081e-06, + "loss": 0.3329, + "step": 13960 + }, + { + "epoch": 2.37, + "grad_norm": 4.318434472721774, + "learning_rate": 1.2680606153573233e-06, + "loss": 0.3416, + "step": 13965 + }, + { + "epoch": 2.38, + "grad_norm": 7.922606543380108, + "learning_rate": 1.264771141993526e-06, + "loss": 0.3374, + "step": 13970 + }, + { + "epoch": 2.38, + "grad_norm": 7.009083560944203, + "learning_rate": 1.261485322904869e-06, + "loss": 0.3347, + "step": 13975 + }, + { + "epoch": 2.38, + "grad_norm": 5.43461490473705, + "learning_rate": 1.2582031613059553e-06, + "loss": 0.3399, + "step": 13980 + }, + { + "epoch": 2.38, + "grad_norm": 7.245591319250889, + "learning_rate": 1.2549246604078164e-06, + "loss": 0.3403, + "step": 13985 + }, + { + "epoch": 2.38, + "grad_norm": 8.551751274701207, + "learning_rate": 1.2516498234178937e-06, + "loss": 0.3415, + "step": 13990 + }, + { + "epoch": 2.38, + "grad_norm": 4.776017027952166, + "learning_rate": 1.2483786535400538e-06, + "loss": 0.3407, + "step": 13995 + }, + { + "epoch": 2.38, + "grad_norm": 5.242890436893526, + "learning_rate": 1.2451111539745646e-06, + "loss": 0.3426, + "step": 14000 + }, + { + "epoch": 2.38, + "grad_norm": 5.3470389354152426, + "learning_rate": 1.2418473279181132e-06, + "loss": 0.3338, + "step": 14005 + }, + { + "epoch": 2.38, + "grad_norm": 6.534546174432391, + "learning_rate": 1.238587178563786e-06, + "loss": 0.3398, + "step": 14010 + }, + { + "epoch": 2.38, + "grad_norm": 6.389675063098023, + "learning_rate": 1.2353307091010775e-06, + "loss": 0.3376, + "step": 14015 + }, + { + "epoch": 2.38, + "grad_norm": 6.168606843433336, + "learning_rate": 1.2320779227158786e-06, + "loss": 0.3459, + "step": 14020 + }, + { + "epoch": 2.38, + "grad_norm": 4.196101054526336, + "learning_rate": 1.2288288225904766e-06, + "loss": 0.3403, + "step": 14025 + }, + { + "epoch": 2.39, + "grad_norm": 4.274286352545161, + "learning_rate": 1.225583411903556e-06, + "loss": 0.3325, + "step": 14030 + }, + { + "epoch": 2.39, + "grad_norm": 4.637182685870529, + "learning_rate": 1.2223416938301885e-06, + "loss": 0.3291, + "step": 14035 + }, + { + "epoch": 2.39, + "grad_norm": 4.649448721624295, + "learning_rate": 1.2191036715418347e-06, + "loss": 0.3358, + "step": 14040 + }, + { + "epoch": 2.39, + "grad_norm": 4.74179880034857, + "learning_rate": 1.2158693482063377e-06, + "loss": 0.3353, + "step": 14045 + }, + { + "epoch": 2.39, + "grad_norm": 6.631733274197324, + "learning_rate": 1.2126387269879254e-06, + "loss": 0.3378, + "step": 14050 + }, + { + "epoch": 2.39, + "grad_norm": 4.935487631839041, + "learning_rate": 1.2094118110471998e-06, + "loss": 0.3474, + "step": 14055 + }, + { + "epoch": 2.39, + "grad_norm": 3.9412089332004983, + "learning_rate": 1.2061886035411412e-06, + "loss": 0.3331, + "step": 14060 + }, + { + "epoch": 2.39, + "grad_norm": 5.5029694305110555, + "learning_rate": 1.202969107623101e-06, + "loss": 0.3372, + "step": 14065 + }, + { + "epoch": 2.39, + "grad_norm": 5.074013228108049, + "learning_rate": 1.1997533264427958e-06, + "loss": 0.3297, + "step": 14070 + }, + { + "epoch": 2.39, + "grad_norm": 6.3155285236036915, + "learning_rate": 1.1965412631463164e-06, + "loss": 0.3376, + "step": 14075 + }, + { + "epoch": 2.39, + "grad_norm": 4.655360501282674, + "learning_rate": 1.193332920876104e-06, + "loss": 0.3341, + "step": 14080 + }, + { + "epoch": 2.39, + "grad_norm": 6.8202273238215465, + "learning_rate": 1.190128302770972e-06, + "loss": 0.3317, + "step": 14085 + }, + { + "epoch": 2.4, + "grad_norm": 5.661250004762066, + "learning_rate": 1.1869274119660818e-06, + "loss": 0.3341, + "step": 14090 + }, + { + "epoch": 2.4, + "grad_norm": 4.750711228093705, + "learning_rate": 1.1837302515929526e-06, + "loss": 0.3319, + "step": 14095 + }, + { + "epoch": 2.4, + "grad_norm": 5.636421776212692, + "learning_rate": 1.180536824779452e-06, + "loss": 0.3328, + "step": 14100 + }, + { + "epoch": 2.4, + "grad_norm": 7.944300055635633, + "learning_rate": 1.177347134649796e-06, + "loss": 0.35, + "step": 14105 + }, + { + "epoch": 2.4, + "grad_norm": 5.783967696727534, + "learning_rate": 1.1741611843245448e-06, + "loss": 0.3384, + "step": 14110 + }, + { + "epoch": 2.4, + "grad_norm": 6.422475390958299, + "learning_rate": 1.1709789769205993e-06, + "loss": 0.3407, + "step": 14115 + }, + { + "epoch": 2.4, + "grad_norm": 6.936647200414145, + "learning_rate": 1.1678005155511984e-06, + "loss": 0.3398, + "step": 14120 + }, + { + "epoch": 2.4, + "grad_norm": 6.250589249776812, + "learning_rate": 1.164625803325915e-06, + "loss": 0.3458, + "step": 14125 + }, + { + "epoch": 2.4, + "grad_norm": 4.970889235761936, + "learning_rate": 1.1614548433506596e-06, + "loss": 0.3436, + "step": 14130 + }, + { + "epoch": 2.4, + "grad_norm": 4.836692761132451, + "learning_rate": 1.1582876387276636e-06, + "loss": 0.336, + "step": 14135 + }, + { + "epoch": 2.4, + "grad_norm": 9.944435936043519, + "learning_rate": 1.1551241925554923e-06, + "loss": 0.3367, + "step": 14140 + }, + { + "epoch": 2.4, + "grad_norm": 6.236576529492092, + "learning_rate": 1.1519645079290277e-06, + "loss": 0.3384, + "step": 14145 + }, + { + "epoch": 2.41, + "grad_norm": 4.470189006998729, + "learning_rate": 1.1488085879394773e-06, + "loss": 0.3485, + "step": 14150 + }, + { + "epoch": 2.41, + "grad_norm": 4.5188328073800585, + "learning_rate": 1.145656435674361e-06, + "loss": 0.3329, + "step": 14155 + }, + { + "epoch": 2.41, + "grad_norm": 4.352260040907254, + "learning_rate": 1.1425080542175143e-06, + "loss": 0.3341, + "step": 14160 + }, + { + "epoch": 2.41, + "grad_norm": 5.822775962292289, + "learning_rate": 1.1393634466490843e-06, + "loss": 0.3405, + "step": 14165 + }, + { + "epoch": 2.41, + "grad_norm": 4.361876934910723, + "learning_rate": 1.1362226160455237e-06, + "loss": 0.3344, + "step": 14170 + }, + { + "epoch": 2.41, + "grad_norm": 8.88786874074134, + "learning_rate": 1.1330855654795948e-06, + "loss": 0.3389, + "step": 14175 + }, + { + "epoch": 2.41, + "grad_norm": 14.262895017022414, + "learning_rate": 1.1299522980203554e-06, + "loss": 0.3286, + "step": 14180 + }, + { + "epoch": 2.41, + "grad_norm": 5.149422771083544, + "learning_rate": 1.1268228167331686e-06, + "loss": 0.3461, + "step": 14185 + }, + { + "epoch": 2.41, + "grad_norm": 4.574946646085239, + "learning_rate": 1.123697124679688e-06, + "loss": 0.349, + "step": 14190 + }, + { + "epoch": 2.41, + "grad_norm": 16.266679457171783, + "learning_rate": 1.120575224917866e-06, + "loss": 0.3411, + "step": 14195 + }, + { + "epoch": 2.41, + "grad_norm": 9.663923228409695, + "learning_rate": 1.1174571205019358e-06, + "loss": 0.3334, + "step": 14200 + }, + { + "epoch": 2.41, + "grad_norm": 6.356748299028564, + "learning_rate": 1.114342814482428e-06, + "loss": 0.3316, + "step": 14205 + }, + { + "epoch": 2.42, + "grad_norm": 6.616836478636501, + "learning_rate": 1.11123230990615e-06, + "loss": 0.3516, + "step": 14210 + }, + { + "epoch": 2.42, + "grad_norm": 4.507223327833415, + "learning_rate": 1.1081256098161913e-06, + "loss": 0.3408, + "step": 14215 + }, + { + "epoch": 2.42, + "grad_norm": 7.034187350742189, + "learning_rate": 1.1050227172519234e-06, + "loss": 0.3415, + "step": 14220 + }, + { + "epoch": 2.42, + "grad_norm": 15.689685827911726, + "learning_rate": 1.1019236352489865e-06, + "loss": 0.3412, + "step": 14225 + }, + { + "epoch": 2.42, + "grad_norm": 7.8734984726878245, + "learning_rate": 1.098828366839299e-06, + "loss": 0.3357, + "step": 14230 + }, + { + "epoch": 2.42, + "grad_norm": 4.382933277091549, + "learning_rate": 1.0957369150510445e-06, + "loss": 0.3475, + "step": 14235 + }, + { + "epoch": 2.42, + "grad_norm": 6.1833242818970815, + "learning_rate": 1.0926492829086728e-06, + "loss": 0.3384, + "step": 14240 + }, + { + "epoch": 2.42, + "grad_norm": 7.661831755769597, + "learning_rate": 1.089565473432897e-06, + "loss": 0.3314, + "step": 14245 + }, + { + "epoch": 2.42, + "grad_norm": 5.278366302398714, + "learning_rate": 1.086485489640694e-06, + "loss": 0.3407, + "step": 14250 + }, + { + "epoch": 2.42, + "grad_norm": 9.140543659429433, + "learning_rate": 1.0834093345452934e-06, + "loss": 0.3349, + "step": 14255 + }, + { + "epoch": 2.42, + "grad_norm": 4.1394738008804595, + "learning_rate": 1.0803370111561789e-06, + "loss": 0.3335, + "step": 14260 + }, + { + "epoch": 2.43, + "grad_norm": 4.4178291423734315, + "learning_rate": 1.0772685224790907e-06, + "loss": 0.3406, + "step": 14265 + }, + { + "epoch": 2.43, + "grad_norm": 6.519217722729064, + "learning_rate": 1.0742038715160108e-06, + "loss": 0.3403, + "step": 14270 + }, + { + "epoch": 2.43, + "grad_norm": 4.799302187562816, + "learning_rate": 1.0711430612651747e-06, + "loss": 0.3458, + "step": 14275 + }, + { + "epoch": 2.43, + "grad_norm": 4.851722917122433, + "learning_rate": 1.0680860947210492e-06, + "loss": 0.3329, + "step": 14280 + }, + { + "epoch": 2.43, + "grad_norm": 7.937356964578476, + "learning_rate": 1.065032974874352e-06, + "loss": 0.332, + "step": 14285 + }, + { + "epoch": 2.43, + "grad_norm": 5.096391120098633, + "learning_rate": 1.0619837047120296e-06, + "loss": 0.3375, + "step": 14290 + }, + { + "epoch": 2.43, + "grad_norm": 4.967495395371477, + "learning_rate": 1.0589382872172682e-06, + "loss": 0.3434, + "step": 14295 + }, + { + "epoch": 2.43, + "grad_norm": 6.419841773328182, + "learning_rate": 1.0558967253694802e-06, + "loss": 0.336, + "step": 14300 + }, + { + "epoch": 2.43, + "grad_norm": 6.0973018817246984, + "learning_rate": 1.0528590221443064e-06, + "loss": 0.3383, + "step": 14305 + }, + { + "epoch": 2.43, + "grad_norm": 5.074301199329289, + "learning_rate": 1.0498251805136162e-06, + "loss": 0.3361, + "step": 14310 + }, + { + "epoch": 2.43, + "grad_norm": 4.976987817402853, + "learning_rate": 1.0467952034454976e-06, + "loss": 0.334, + "step": 14315 + }, + { + "epoch": 2.43, + "grad_norm": 6.122772139785621, + "learning_rate": 1.0437690939042594e-06, + "loss": 0.3376, + "step": 14320 + }, + { + "epoch": 2.44, + "grad_norm": 4.564482802942002, + "learning_rate": 1.0407468548504234e-06, + "loss": 0.3302, + "step": 14325 + }, + { + "epoch": 2.44, + "grad_norm": 6.250238575968118, + "learning_rate": 1.0377284892407318e-06, + "loss": 0.3333, + "step": 14330 + }, + { + "epoch": 2.44, + "grad_norm": 5.218686261207107, + "learning_rate": 1.0347140000281297e-06, + "loss": 0.3353, + "step": 14335 + }, + { + "epoch": 2.44, + "grad_norm": 5.063182793310301, + "learning_rate": 1.0317033901617763e-06, + "loss": 0.3419, + "step": 14340 + }, + { + "epoch": 2.44, + "grad_norm": 4.656996118444409, + "learning_rate": 1.0286966625870304e-06, + "loss": 0.3317, + "step": 14345 + }, + { + "epoch": 2.44, + "grad_norm": 5.4213939092521874, + "learning_rate": 1.0256938202454536e-06, + "loss": 0.3429, + "step": 14350 + }, + { + "epoch": 2.44, + "grad_norm": 8.364522415132814, + "learning_rate": 1.022694866074812e-06, + "loss": 0.334, + "step": 14355 + }, + { + "epoch": 2.44, + "grad_norm": 4.928770765768932, + "learning_rate": 1.0196998030090577e-06, + "loss": 0.33, + "step": 14360 + }, + { + "epoch": 2.44, + "grad_norm": 6.018321446051179, + "learning_rate": 1.0167086339783455e-06, + "loss": 0.3488, + "step": 14365 + }, + { + "epoch": 2.44, + "grad_norm": 5.015085898456728, + "learning_rate": 1.0137213619090142e-06, + "loss": 0.336, + "step": 14370 + }, + { + "epoch": 2.44, + "grad_norm": 10.398286045146108, + "learning_rate": 1.0107379897235959e-06, + "loss": 0.356, + "step": 14375 + }, + { + "epoch": 2.44, + "grad_norm": 4.891907399568669, + "learning_rate": 1.0077585203408003e-06, + "loss": 0.3352, + "step": 14380 + }, + { + "epoch": 2.45, + "grad_norm": 6.407190738784242, + "learning_rate": 1.0047829566755262e-06, + "loss": 0.3306, + "step": 14385 + }, + { + "epoch": 2.45, + "grad_norm": 8.082744545042239, + "learning_rate": 1.001811301638846e-06, + "loss": 0.3399, + "step": 14390 + }, + { + "epoch": 2.45, + "grad_norm": 4.318854243071567, + "learning_rate": 9.988435581380102e-07, + "loss": 0.3395, + "step": 14395 + }, + { + "epoch": 2.45, + "grad_norm": 4.723240011964577, + "learning_rate": 9.95879729076442e-07, + "loss": 0.3308, + "step": 14400 + }, + { + "epoch": 2.45, + "grad_norm": 5.3405520836823985, + "learning_rate": 9.929198173537346e-07, + "loss": 0.3298, + "step": 14405 + }, + { + "epoch": 2.45, + "grad_norm": 5.770996377634228, + "learning_rate": 9.899638258656518e-07, + "loss": 0.3364, + "step": 14410 + }, + { + "epoch": 2.45, + "grad_norm": 4.911279512730939, + "learning_rate": 9.870117575041172e-07, + "loss": 0.3254, + "step": 14415 + }, + { + "epoch": 2.45, + "grad_norm": 5.785308442014827, + "learning_rate": 9.840636151572215e-07, + "loss": 0.3395, + "step": 14420 + }, + { + "epoch": 2.45, + "grad_norm": 4.7477711398773295, + "learning_rate": 9.811194017092086e-07, + "loss": 0.3422, + "step": 14425 + }, + { + "epoch": 2.45, + "grad_norm": 4.243548036699693, + "learning_rate": 9.781791200404855e-07, + "loss": 0.3408, + "step": 14430 + }, + { + "epoch": 2.45, + "grad_norm": 4.289453405665378, + "learning_rate": 9.752427730276076e-07, + "loss": 0.3391, + "step": 14435 + }, + { + "epoch": 2.45, + "grad_norm": 5.724613198135481, + "learning_rate": 9.723103635432823e-07, + "loss": 0.3361, + "step": 14440 + }, + { + "epoch": 2.46, + "grad_norm": 8.191455303424572, + "learning_rate": 9.693818944563644e-07, + "loss": 0.341, + "step": 14445 + }, + { + "epoch": 2.46, + "grad_norm": 11.469298515326782, + "learning_rate": 9.664573686318535e-07, + "loss": 0.3313, + "step": 14450 + }, + { + "epoch": 2.46, + "grad_norm": 5.138448172334449, + "learning_rate": 9.635367889308945e-07, + "loss": 0.3467, + "step": 14455 + }, + { + "epoch": 2.46, + "grad_norm": 4.280952234714455, + "learning_rate": 9.606201582107666e-07, + "loss": 0.3464, + "step": 14460 + }, + { + "epoch": 2.46, + "grad_norm": 4.780709911013356, + "learning_rate": 9.577074793248908e-07, + "loss": 0.3312, + "step": 14465 + }, + { + "epoch": 2.46, + "grad_norm": 4.421025545234076, + "learning_rate": 9.547987551228172e-07, + "loss": 0.3322, + "step": 14470 + }, + { + "epoch": 2.46, + "grad_norm": 4.595092239009826, + "learning_rate": 9.518939884502315e-07, + "loss": 0.3373, + "step": 14475 + }, + { + "epoch": 2.46, + "grad_norm": 6.702879081922312, + "learning_rate": 9.489931821489439e-07, + "loss": 0.3409, + "step": 14480 + }, + { + "epoch": 2.46, + "grad_norm": 5.4970994508148445, + "learning_rate": 9.460963390568922e-07, + "loss": 0.3347, + "step": 14485 + }, + { + "epoch": 2.46, + "grad_norm": 4.376402649743197, + "learning_rate": 9.432034620081349e-07, + "loss": 0.3371, + "step": 14490 + }, + { + "epoch": 2.46, + "grad_norm": 4.8537603318498395, + "learning_rate": 9.403145538328512e-07, + "loss": 0.3202, + "step": 14495 + }, + { + "epoch": 2.47, + "grad_norm": 4.55627346536457, + "learning_rate": 9.3742961735734e-07, + "loss": 0.3382, + "step": 14500 + }, + { + "epoch": 2.47, + "grad_norm": 4.337405894436932, + "learning_rate": 9.3454865540401e-07, + "loss": 0.3389, + "step": 14505 + }, + { + "epoch": 2.47, + "grad_norm": 5.558365503858435, + "learning_rate": 9.31671670791387e-07, + "loss": 0.3355, + "step": 14510 + }, + { + "epoch": 2.47, + "grad_norm": 4.468843945765404, + "learning_rate": 9.287986663340998e-07, + "loss": 0.3321, + "step": 14515 + }, + { + "epoch": 2.47, + "grad_norm": 8.908580862152249, + "learning_rate": 9.259296448428895e-07, + "loss": 0.3367, + "step": 14520 + }, + { + "epoch": 2.47, + "grad_norm": 5.877623694657907, + "learning_rate": 9.230646091245932e-07, + "loss": 0.3399, + "step": 14525 + }, + { + "epoch": 2.47, + "grad_norm": 4.82838509264817, + "learning_rate": 9.202035619821553e-07, + "loss": 0.3391, + "step": 14530 + }, + { + "epoch": 2.47, + "grad_norm": 7.028694803195866, + "learning_rate": 9.173465062146148e-07, + "loss": 0.3369, + "step": 14535 + }, + { + "epoch": 2.47, + "grad_norm": 4.237706071931036, + "learning_rate": 9.14493444617105e-07, + "loss": 0.334, + "step": 14540 + }, + { + "epoch": 2.47, + "grad_norm": 4.825383076615841, + "learning_rate": 9.11644379980855e-07, + "loss": 0.3352, + "step": 14545 + }, + { + "epoch": 2.47, + "grad_norm": 4.316499817256464, + "learning_rate": 9.087993150931801e-07, + "loss": 0.3326, + "step": 14550 + }, + { + "epoch": 2.47, + "grad_norm": 4.911143231324706, + "learning_rate": 9.05958252737486e-07, + "loss": 0.3298, + "step": 14555 + }, + { + "epoch": 2.48, + "grad_norm": 4.6358832021455445, + "learning_rate": 9.03121195693259e-07, + "loss": 0.332, + "step": 14560 + }, + { + "epoch": 2.48, + "grad_norm": 4.947430264832714, + "learning_rate": 9.002881467360692e-07, + "loss": 0.3344, + "step": 14565 + }, + { + "epoch": 2.48, + "grad_norm": 7.248449927175377, + "learning_rate": 8.974591086375634e-07, + "loss": 0.335, + "step": 14570 + }, + { + "epoch": 2.48, + "grad_norm": 5.458948458123161, + "learning_rate": 8.946340841654677e-07, + "loss": 0.331, + "step": 14575 + }, + { + "epoch": 2.48, + "grad_norm": 5.198133276647935, + "learning_rate": 8.918130760835797e-07, + "loss": 0.329, + "step": 14580 + }, + { + "epoch": 2.48, + "grad_norm": 4.712618679236314, + "learning_rate": 8.88996087151765e-07, + "loss": 0.3355, + "step": 14585 + }, + { + "epoch": 2.48, + "grad_norm": 5.002139102868113, + "learning_rate": 8.861831201259635e-07, + "loss": 0.3386, + "step": 14590 + }, + { + "epoch": 2.48, + "grad_norm": 5.133980115384712, + "learning_rate": 8.833741777581739e-07, + "loss": 0.3274, + "step": 14595 + }, + { + "epoch": 2.48, + "grad_norm": 8.734894342249994, + "learning_rate": 8.80569262796464e-07, + "loss": 0.333, + "step": 14600 + }, + { + "epoch": 2.48, + "grad_norm": 4.053567103499762, + "learning_rate": 8.777683779849527e-07, + "loss": 0.3423, + "step": 14605 + }, + { + "epoch": 2.48, + "grad_norm": 4.718447412100042, + "learning_rate": 8.749715260638247e-07, + "loss": 0.3273, + "step": 14610 + }, + { + "epoch": 2.48, + "grad_norm": 6.513812256605959, + "learning_rate": 8.721787097693141e-07, + "loss": 0.3336, + "step": 14615 + }, + { + "epoch": 2.49, + "grad_norm": 4.621238267112457, + "learning_rate": 8.693899318337095e-07, + "loss": 0.3302, + "step": 14620 + }, + { + "epoch": 2.49, + "grad_norm": 4.445122668669881, + "learning_rate": 8.666051949853472e-07, + "loss": 0.3257, + "step": 14625 + }, + { + "epoch": 2.49, + "grad_norm": 4.709037015599132, + "learning_rate": 8.638245019486091e-07, + "loss": 0.3338, + "step": 14630 + }, + { + "epoch": 2.49, + "grad_norm": 4.360940980024568, + "learning_rate": 8.610478554439244e-07, + "loss": 0.3343, + "step": 14635 + }, + { + "epoch": 2.49, + "grad_norm": 4.6143899100962, + "learning_rate": 8.582752581877607e-07, + "loss": 0.3326, + "step": 14640 + }, + { + "epoch": 2.49, + "grad_norm": 6.7097884600896, + "learning_rate": 8.555067128926236e-07, + "loss": 0.3355, + "step": 14645 + }, + { + "epoch": 2.49, + "grad_norm": 4.298828685434539, + "learning_rate": 8.52742222267055e-07, + "loss": 0.3291, + "step": 14650 + }, + { + "epoch": 2.49, + "grad_norm": 4.260336253602938, + "learning_rate": 8.499817890156331e-07, + "loss": 0.3317, + "step": 14655 + }, + { + "epoch": 2.49, + "grad_norm": 5.064517398086245, + "learning_rate": 8.47225415838962e-07, + "loss": 0.3272, + "step": 14660 + }, + { + "epoch": 2.49, + "grad_norm": 4.483713586833253, + "learning_rate": 8.44473105433678e-07, + "loss": 0.3346, + "step": 14665 + }, + { + "epoch": 2.49, + "grad_norm": 4.264668270536011, + "learning_rate": 8.417248604924394e-07, + "loss": 0.3399, + "step": 14670 + }, + { + "epoch": 2.49, + "grad_norm": 5.172403867722749, + "learning_rate": 8.389806837039272e-07, + "loss": 0.3337, + "step": 14675 + }, + { + "epoch": 2.5, + "grad_norm": 4.8912479936973945, + "learning_rate": 8.362405777528471e-07, + "loss": 0.3433, + "step": 14680 + }, + { + "epoch": 2.5, + "grad_norm": 5.494072929459606, + "learning_rate": 8.335045453199142e-07, + "loss": 0.3418, + "step": 14685 + }, + { + "epoch": 2.5, + "grad_norm": 5.405357055704501, + "learning_rate": 8.307725890818658e-07, + "loss": 0.3388, + "step": 14690 + }, + { + "epoch": 2.5, + "grad_norm": 7.187527389254743, + "learning_rate": 8.280447117114465e-07, + "loss": 0.3323, + "step": 14695 + }, + { + "epoch": 2.5, + "grad_norm": 4.665466163749549, + "learning_rate": 8.25320915877415e-07, + "loss": 0.3339, + "step": 14700 + }, + { + "epoch": 2.5, + "grad_norm": 4.553292854497034, + "learning_rate": 8.226012042445308e-07, + "loss": 0.3361, + "step": 14705 + }, + { + "epoch": 2.5, + "grad_norm": 6.328542620083238, + "learning_rate": 8.198855794735644e-07, + "loss": 0.3329, + "step": 14710 + }, + { + "epoch": 2.5, + "grad_norm": 5.279384402377715, + "learning_rate": 8.171740442212833e-07, + "loss": 0.3492, + "step": 14715 + }, + { + "epoch": 2.5, + "grad_norm": 5.645996678991626, + "learning_rate": 8.144666011404556e-07, + "loss": 0.3399, + "step": 14720 + }, + { + "epoch": 2.5, + "grad_norm": 4.832297637021295, + "learning_rate": 8.117632528798458e-07, + "loss": 0.3485, + "step": 14725 + }, + { + "epoch": 2.5, + "grad_norm": 4.637719837256791, + "learning_rate": 8.090640020842117e-07, + "loss": 0.3327, + "step": 14730 + }, + { + "epoch": 2.51, + "grad_norm": 4.937432153179189, + "learning_rate": 8.063688513943046e-07, + "loss": 0.3298, + "step": 14735 + }, + { + "epoch": 2.51, + "grad_norm": 5.247228681295939, + "learning_rate": 8.036778034468617e-07, + "loss": 0.3348, + "step": 14740 + }, + { + "epoch": 2.51, + "grad_norm": 4.601109205992861, + "learning_rate": 8.009908608746097e-07, + "loss": 0.3265, + "step": 14745 + }, + { + "epoch": 2.51, + "grad_norm": 7.610534797221197, + "learning_rate": 7.983080263062542e-07, + "loss": 0.3333, + "step": 14750 + }, + { + "epoch": 2.51, + "grad_norm": 6.139040766800028, + "learning_rate": 7.956293023664879e-07, + "loss": 0.3309, + "step": 14755 + }, + { + "epoch": 2.51, + "grad_norm": 11.412769842768002, + "learning_rate": 7.929546916759772e-07, + "loss": 0.3306, + "step": 14760 + }, + { + "epoch": 2.51, + "grad_norm": 7.884387501900604, + "learning_rate": 7.902841968513652e-07, + "loss": 0.3341, + "step": 14765 + }, + { + "epoch": 2.51, + "grad_norm": 6.846825276989522, + "learning_rate": 7.876178205052698e-07, + "loss": 0.3415, + "step": 14770 + }, + { + "epoch": 2.51, + "grad_norm": 7.665868376155784, + "learning_rate": 7.849555652462775e-07, + "loss": 0.3269, + "step": 14775 + }, + { + "epoch": 2.51, + "grad_norm": 4.423483816559562, + "learning_rate": 7.822974336789468e-07, + "loss": 0.3184, + "step": 14780 + }, + { + "epoch": 2.51, + "grad_norm": 5.625052189667723, + "learning_rate": 7.796434284037973e-07, + "loss": 0.3375, + "step": 14785 + }, + { + "epoch": 2.51, + "grad_norm": 4.130904432799836, + "learning_rate": 7.769935520173155e-07, + "loss": 0.3402, + "step": 14790 + }, + { + "epoch": 2.52, + "grad_norm": 4.298938169660414, + "learning_rate": 7.743478071119459e-07, + "loss": 0.3269, + "step": 14795 + }, + { + "epoch": 2.52, + "grad_norm": 4.9781642622490025, + "learning_rate": 7.717061962760947e-07, + "loss": 0.3334, + "step": 14800 + }, + { + "epoch": 2.52, + "grad_norm": 4.914812405158503, + "learning_rate": 7.690687220941162e-07, + "loss": 0.3292, + "step": 14805 + }, + { + "epoch": 2.52, + "grad_norm": 4.047822614780733, + "learning_rate": 7.664353871463264e-07, + "loss": 0.3182, + "step": 14810 + }, + { + "epoch": 2.52, + "grad_norm": 7.91907198905064, + "learning_rate": 7.638061940089875e-07, + "loss": 0.3363, + "step": 14815 + }, + { + "epoch": 2.52, + "grad_norm": 5.3497987908934865, + "learning_rate": 7.611811452543072e-07, + "loss": 0.331, + "step": 14820 + }, + { + "epoch": 2.52, + "grad_norm": 4.267298014785521, + "learning_rate": 7.585602434504453e-07, + "loss": 0.3287, + "step": 14825 + }, + { + "epoch": 2.52, + "grad_norm": 4.802029820505902, + "learning_rate": 7.559434911614977e-07, + "loss": 0.3252, + "step": 14830 + }, + { + "epoch": 2.52, + "grad_norm": 4.32890062588132, + "learning_rate": 7.533308909475068e-07, + "loss": 0.3271, + "step": 14835 + }, + { + "epoch": 2.52, + "grad_norm": 4.268375713285703, + "learning_rate": 7.507224453644474e-07, + "loss": 0.3405, + "step": 14840 + }, + { + "epoch": 2.52, + "grad_norm": 4.399286170087374, + "learning_rate": 7.481181569642332e-07, + "loss": 0.3243, + "step": 14845 + }, + { + "epoch": 2.52, + "grad_norm": 4.304404015063495, + "learning_rate": 7.455180282947083e-07, + "loss": 0.3337, + "step": 14850 + }, + { + "epoch": 2.53, + "grad_norm": 5.029073052501918, + "learning_rate": 7.429220618996507e-07, + "loss": 0.3262, + "step": 14855 + }, + { + "epoch": 2.53, + "grad_norm": 6.378391554889924, + "learning_rate": 7.40330260318764e-07, + "loss": 0.3384, + "step": 14860 + }, + { + "epoch": 2.53, + "grad_norm": 4.621059063167957, + "learning_rate": 7.377426260876757e-07, + "loss": 0.3335, + "step": 14865 + }, + { + "epoch": 2.53, + "grad_norm": 5.791322359845037, + "learning_rate": 7.351591617379411e-07, + "loss": 0.3324, + "step": 14870 + }, + { + "epoch": 2.53, + "grad_norm": 4.678482618842939, + "learning_rate": 7.325798697970305e-07, + "loss": 0.3365, + "step": 14875 + }, + { + "epoch": 2.53, + "grad_norm": 4.583563067785552, + "learning_rate": 7.300047527883375e-07, + "loss": 0.3338, + "step": 14880 + }, + { + "epoch": 2.53, + "grad_norm": 5.391075538129845, + "learning_rate": 7.274338132311653e-07, + "loss": 0.3414, + "step": 14885 + }, + { + "epoch": 2.53, + "grad_norm": 4.478843806589986, + "learning_rate": 7.248670536407354e-07, + "loss": 0.3129, + "step": 14890 + }, + { + "epoch": 2.53, + "grad_norm": 4.5267513455016415, + "learning_rate": 7.223044765281767e-07, + "loss": 0.3326, + "step": 14895 + }, + { + "epoch": 2.53, + "grad_norm": 5.660762723698685, + "learning_rate": 7.197460844005294e-07, + "loss": 0.3379, + "step": 14900 + }, + { + "epoch": 2.53, + "grad_norm": 8.65434649620914, + "learning_rate": 7.171918797607369e-07, + "loss": 0.3294, + "step": 14905 + }, + { + "epoch": 2.53, + "grad_norm": 7.1632545699544945, + "learning_rate": 7.146418651076443e-07, + "loss": 0.3294, + "step": 14910 + }, + { + "epoch": 2.54, + "grad_norm": 10.121338816522481, + "learning_rate": 7.12096042936003e-07, + "loss": 0.3395, + "step": 14915 + }, + { + "epoch": 2.54, + "grad_norm": 8.581526830547222, + "learning_rate": 7.095544157364575e-07, + "loss": 0.3359, + "step": 14920 + }, + { + "epoch": 2.54, + "grad_norm": 5.044604807753416, + "learning_rate": 7.070169859955506e-07, + "loss": 0.335, + "step": 14925 + }, + { + "epoch": 2.54, + "grad_norm": 7.000469258632066, + "learning_rate": 7.04483756195718e-07, + "loss": 0.3355, + "step": 14930 + }, + { + "epoch": 2.54, + "grad_norm": 6.017536310415441, + "learning_rate": 7.019547288152872e-07, + "loss": 0.3344, + "step": 14935 + }, + { + "epoch": 2.54, + "grad_norm": 7.900616115213088, + "learning_rate": 6.994299063284738e-07, + "loss": 0.3161, + "step": 14940 + }, + { + "epoch": 2.54, + "grad_norm": 5.694735372740905, + "learning_rate": 6.969092912053798e-07, + "loss": 0.3301, + "step": 14945 + }, + { + "epoch": 2.54, + "grad_norm": 4.03599084217576, + "learning_rate": 6.943928859119914e-07, + "loss": 0.3233, + "step": 14950 + }, + { + "epoch": 2.54, + "grad_norm": 7.050873466380064, + "learning_rate": 6.91880692910174e-07, + "loss": 0.3257, + "step": 14955 + }, + { + "epoch": 2.54, + "grad_norm": 4.478973666964457, + "learning_rate": 6.893727146576773e-07, + "loss": 0.3301, + "step": 14960 + }, + { + "epoch": 2.54, + "grad_norm": 5.442444582301273, + "learning_rate": 6.868689536081197e-07, + "loss": 0.3403, + "step": 14965 + }, + { + "epoch": 2.55, + "grad_norm": 4.26838730409746, + "learning_rate": 6.843694122110017e-07, + "loss": 0.3344, + "step": 14970 + }, + { + "epoch": 2.55, + "grad_norm": 6.4244297663026595, + "learning_rate": 6.81874092911689e-07, + "loss": 0.327, + "step": 14975 + }, + { + "epoch": 2.55, + "grad_norm": 6.237606573282461, + "learning_rate": 6.793829981514228e-07, + "loss": 0.3373, + "step": 14980 + }, + { + "epoch": 2.55, + "grad_norm": 6.296940542734641, + "learning_rate": 6.768961303673055e-07, + "loss": 0.318, + "step": 14985 + }, + { + "epoch": 2.55, + "grad_norm": 4.660218223419286, + "learning_rate": 6.744134919923096e-07, + "loss": 0.3253, + "step": 14990 + }, + { + "epoch": 2.55, + "grad_norm": 7.067592424361357, + "learning_rate": 6.719350854552659e-07, + "loss": 0.3245, + "step": 14995 + }, + { + "epoch": 2.55, + "grad_norm": 4.552954424557108, + "learning_rate": 6.694609131808666e-07, + "loss": 0.3287, + "step": 15000 + }, + { + "epoch": 2.55, + "grad_norm": 4.652184851420003, + "learning_rate": 6.669909775896605e-07, + "loss": 0.322, + "step": 15005 + }, + { + "epoch": 2.55, + "grad_norm": 4.855649009523, + "learning_rate": 6.645252810980519e-07, + "loss": 0.3314, + "step": 15010 + }, + { + "epoch": 2.55, + "grad_norm": 6.9922125059213265, + "learning_rate": 6.620638261182998e-07, + "loss": 0.3279, + "step": 15015 + }, + { + "epoch": 2.55, + "grad_norm": 4.399204658913523, + "learning_rate": 6.596066150585107e-07, + "loss": 0.3338, + "step": 15020 + }, + { + "epoch": 2.55, + "grad_norm": 4.694809045991095, + "learning_rate": 6.571536503226411e-07, + "loss": 0.3288, + "step": 15025 + }, + { + "epoch": 2.56, + "grad_norm": 7.276032527676246, + "learning_rate": 6.547049343104916e-07, + "loss": 0.342, + "step": 15030 + }, + { + "epoch": 2.56, + "grad_norm": 5.683686403735051, + "learning_rate": 6.522604694177093e-07, + "loss": 0.3233, + "step": 15035 + }, + { + "epoch": 2.56, + "grad_norm": 9.314664074593344, + "learning_rate": 6.498202580357788e-07, + "loss": 0.3339, + "step": 15040 + }, + { + "epoch": 2.56, + "grad_norm": 7.271181418939168, + "learning_rate": 6.473843025520243e-07, + "loss": 0.3413, + "step": 15045 + }, + { + "epoch": 2.56, + "grad_norm": 9.026745251495655, + "learning_rate": 6.449526053496069e-07, + "loss": 0.3275, + "step": 15050 + }, + { + "epoch": 2.56, + "grad_norm": 4.973143978132485, + "learning_rate": 6.425251688075212e-07, + "loss": 0.3409, + "step": 15055 + }, + { + "epoch": 2.56, + "grad_norm": 4.734733913002473, + "learning_rate": 6.401019953005949e-07, + "loss": 0.3299, + "step": 15060 + }, + { + "epoch": 2.56, + "grad_norm": 4.264578374363782, + "learning_rate": 6.376830871994827e-07, + "loss": 0.3257, + "step": 15065 + }, + { + "epoch": 2.56, + "grad_norm": 6.804642974562434, + "learning_rate": 6.352684468706699e-07, + "loss": 0.3347, + "step": 15070 + }, + { + "epoch": 2.56, + "grad_norm": 4.75835234659942, + "learning_rate": 6.328580766764613e-07, + "loss": 0.3276, + "step": 15075 + }, + { + "epoch": 2.56, + "grad_norm": 4.707073653371905, + "learning_rate": 6.304519789749907e-07, + "loss": 0.3185, + "step": 15080 + }, + { + "epoch": 2.56, + "grad_norm": 4.670681759892701, + "learning_rate": 6.28050156120204e-07, + "loss": 0.3323, + "step": 15085 + }, + { + "epoch": 2.57, + "grad_norm": 4.666206453580839, + "learning_rate": 6.256526104618732e-07, + "loss": 0.338, + "step": 15090 + }, + { + "epoch": 2.57, + "grad_norm": 4.61513244052899, + "learning_rate": 6.232593443455797e-07, + "loss": 0.3252, + "step": 15095 + }, + { + "epoch": 2.57, + "grad_norm": 5.2786063959854825, + "learning_rate": 6.208703601127198e-07, + "loss": 0.3361, + "step": 15100 + }, + { + "epoch": 2.57, + "grad_norm": 4.458489925474581, + "learning_rate": 6.184856601005035e-07, + "loss": 0.3347, + "step": 15105 + }, + { + "epoch": 2.57, + "grad_norm": 7.942362062095153, + "learning_rate": 6.161052466419449e-07, + "loss": 0.3317, + "step": 15110 + }, + { + "epoch": 2.57, + "grad_norm": 7.201628485037839, + "learning_rate": 6.137291220658687e-07, + "loss": 0.3323, + "step": 15115 + }, + { + "epoch": 2.57, + "grad_norm": 6.107208916924716, + "learning_rate": 6.113572886968994e-07, + "loss": 0.3258, + "step": 15120 + }, + { + "epoch": 2.57, + "grad_norm": 5.377864924772755, + "learning_rate": 6.089897488554685e-07, + "loss": 0.3304, + "step": 15125 + }, + { + "epoch": 2.57, + "grad_norm": 4.212508677166943, + "learning_rate": 6.066265048578007e-07, + "loss": 0.3345, + "step": 15130 + }, + { + "epoch": 2.57, + "grad_norm": 4.611635469346788, + "learning_rate": 6.042675590159241e-07, + "loss": 0.3303, + "step": 15135 + }, + { + "epoch": 2.57, + "grad_norm": 4.553090232084375, + "learning_rate": 6.019129136376578e-07, + "loss": 0.3343, + "step": 15140 + }, + { + "epoch": 2.57, + "grad_norm": 5.034051982014956, + "learning_rate": 5.99562571026614e-07, + "loss": 0.3354, + "step": 15145 + }, + { + "epoch": 2.58, + "grad_norm": 4.272555573157402, + "learning_rate": 5.972165334821983e-07, + "loss": 0.3366, + "step": 15150 + }, + { + "epoch": 2.58, + "grad_norm": 6.82950133715513, + "learning_rate": 5.94874803299601e-07, + "loss": 0.3234, + "step": 15155 + }, + { + "epoch": 2.58, + "grad_norm": 6.959497100574585, + "learning_rate": 5.925373827698011e-07, + "loss": 0.3163, + "step": 15160 + }, + { + "epoch": 2.58, + "grad_norm": 5.261785061241256, + "learning_rate": 5.902042741795594e-07, + "loss": 0.3275, + "step": 15165 + }, + { + "epoch": 2.58, + "grad_norm": 5.01198603476133, + "learning_rate": 5.878754798114189e-07, + "loss": 0.3256, + "step": 15170 + }, + { + "epoch": 2.58, + "grad_norm": 5.558766408563095, + "learning_rate": 5.855510019437011e-07, + "loss": 0.327, + "step": 15175 + }, + { + "epoch": 2.58, + "grad_norm": 4.556461907057693, + "learning_rate": 5.83230842850508e-07, + "loss": 0.3342, + "step": 15180 + }, + { + "epoch": 2.58, + "grad_norm": 8.951444850543265, + "learning_rate": 5.809150048017115e-07, + "loss": 0.3355, + "step": 15185 + }, + { + "epoch": 2.58, + "grad_norm": 4.21513520852271, + "learning_rate": 5.786034900629584e-07, + "loss": 0.3265, + "step": 15190 + }, + { + "epoch": 2.58, + "grad_norm": 4.822438591492039, + "learning_rate": 5.762963008956674e-07, + "loss": 0.3196, + "step": 15195 + }, + { + "epoch": 2.58, + "grad_norm": 4.088036776916842, + "learning_rate": 5.739934395570224e-07, + "loss": 0.329, + "step": 15200 + }, + { + "epoch": 2.59, + "grad_norm": 5.32928136338505, + "learning_rate": 5.716949082999773e-07, + "loss": 0.3319, + "step": 15205 + }, + { + "epoch": 2.59, + "grad_norm": 4.406320074819915, + "learning_rate": 5.694007093732434e-07, + "loss": 0.3246, + "step": 15210 + }, + { + "epoch": 2.59, + "grad_norm": 4.283054897769128, + "learning_rate": 5.671108450213009e-07, + "loss": 0.3255, + "step": 15215 + }, + { + "epoch": 2.59, + "grad_norm": 5.293700654067463, + "learning_rate": 5.648253174843826e-07, + "loss": 0.3305, + "step": 15220 + }, + { + "epoch": 2.59, + "grad_norm": 6.174784098397865, + "learning_rate": 5.625441289984851e-07, + "loss": 0.309, + "step": 15225 + }, + { + "epoch": 2.59, + "grad_norm": 4.554691783757481, + "learning_rate": 5.602672817953547e-07, + "loss": 0.3309, + "step": 15230 + }, + { + "epoch": 2.59, + "grad_norm": 7.632898096662614, + "learning_rate": 5.579947781024919e-07, + "loss": 0.326, + "step": 15235 + }, + { + "epoch": 2.59, + "grad_norm": 6.394787708139916, + "learning_rate": 5.5572662014315e-07, + "loss": 0.3218, + "step": 15240 + }, + { + "epoch": 2.59, + "grad_norm": 8.185992925087373, + "learning_rate": 5.534628101363287e-07, + "loss": 0.3383, + "step": 15245 + }, + { + "epoch": 2.59, + "grad_norm": 4.80027617699584, + "learning_rate": 5.51203350296774e-07, + "loss": 0.3289, + "step": 15250 + }, + { + "epoch": 2.59, + "grad_norm": 5.719033975776043, + "learning_rate": 5.489482428349751e-07, + "loss": 0.3349, + "step": 15255 + }, + { + "epoch": 2.59, + "grad_norm": 4.394999674061311, + "learning_rate": 5.46697489957167e-07, + "loss": 0.3406, + "step": 15260 + }, + { + "epoch": 2.6, + "grad_norm": 5.365908251012121, + "learning_rate": 5.444510938653191e-07, + "loss": 0.3295, + "step": 15265 + }, + { + "epoch": 2.6, + "grad_norm": 4.531697620960981, + "learning_rate": 5.422090567571448e-07, + "loss": 0.3352, + "step": 15270 + }, + { + "epoch": 2.6, + "grad_norm": 4.818715998208998, + "learning_rate": 5.399713808260871e-07, + "loss": 0.332, + "step": 15275 + }, + { + "epoch": 2.6, + "grad_norm": 4.722096102165326, + "learning_rate": 5.377380682613243e-07, + "loss": 0.3303, + "step": 15280 + }, + { + "epoch": 2.6, + "grad_norm": 4.677021215940097, + "learning_rate": 5.355091212477693e-07, + "loss": 0.3251, + "step": 15285 + }, + { + "epoch": 2.6, + "grad_norm": 5.33303885654041, + "learning_rate": 5.33284541966057e-07, + "loss": 0.3324, + "step": 15290 + }, + { + "epoch": 2.6, + "grad_norm": 5.959350957878013, + "learning_rate": 5.310643325925563e-07, + "loss": 0.3139, + "step": 15295 + }, + { + "epoch": 2.6, + "grad_norm": 7.711470113011544, + "learning_rate": 5.288484952993556e-07, + "loss": 0.3306, + "step": 15300 + }, + { + "epoch": 2.6, + "grad_norm": 4.881691636537076, + "learning_rate": 5.266370322542713e-07, + "loss": 0.3299, + "step": 15305 + }, + { + "epoch": 2.6, + "grad_norm": 4.341716160517968, + "learning_rate": 5.244299456208341e-07, + "loss": 0.3282, + "step": 15310 + }, + { + "epoch": 2.6, + "grad_norm": 4.6246624806682375, + "learning_rate": 5.222272375582993e-07, + "loss": 0.3269, + "step": 15315 + }, + { + "epoch": 2.6, + "grad_norm": 4.538722876182721, + "learning_rate": 5.200289102216338e-07, + "loss": 0.3291, + "step": 15320 + }, + { + "epoch": 2.61, + "grad_norm": 4.1239510295431305, + "learning_rate": 5.178349657615217e-07, + "loss": 0.3253, + "step": 15325 + }, + { + "epoch": 2.61, + "grad_norm": 4.21239231566736, + "learning_rate": 5.156454063243566e-07, + "loss": 0.3269, + "step": 15330 + }, + { + "epoch": 2.61, + "grad_norm": 6.697072352286349, + "learning_rate": 5.134602340522437e-07, + "loss": 0.328, + "step": 15335 + }, + { + "epoch": 2.61, + "grad_norm": 6.146424287472719, + "learning_rate": 5.112794510829977e-07, + "loss": 0.3259, + "step": 15340 + }, + { + "epoch": 2.61, + "grad_norm": 4.001634187370734, + "learning_rate": 5.091030595501351e-07, + "loss": 0.3314, + "step": 15345 + }, + { + "epoch": 2.61, + "grad_norm": 4.675020804818011, + "learning_rate": 5.069310615828804e-07, + "loss": 0.3298, + "step": 15350 + }, + { + "epoch": 2.61, + "grad_norm": 5.884773853738382, + "learning_rate": 5.047634593061562e-07, + "loss": 0.341, + "step": 15355 + }, + { + "epoch": 2.61, + "grad_norm": 4.686820504059619, + "learning_rate": 5.026002548405878e-07, + "loss": 0.3222, + "step": 15360 + }, + { + "epoch": 2.61, + "grad_norm": 6.570517269507593, + "learning_rate": 5.004414503024962e-07, + "loss": 0.3336, + "step": 15365 + }, + { + "epoch": 2.61, + "grad_norm": 4.382963602034246, + "learning_rate": 4.982870478038976e-07, + "loss": 0.3315, + "step": 15370 + }, + { + "epoch": 2.61, + "grad_norm": 4.8333082592597085, + "learning_rate": 4.96137049452502e-07, + "loss": 0.3108, + "step": 15375 + }, + { + "epoch": 2.61, + "grad_norm": 4.703537645888523, + "learning_rate": 4.939914573517097e-07, + "loss": 0.33, + "step": 15380 + }, + { + "epoch": 2.62, + "grad_norm": 4.055508354992322, + "learning_rate": 4.918502736006136e-07, + "loss": 0.3272, + "step": 15385 + }, + { + "epoch": 2.62, + "grad_norm": 5.140679591165281, + "learning_rate": 4.897135002939896e-07, + "loss": 0.316, + "step": 15390 + }, + { + "epoch": 2.62, + "grad_norm": 4.6297789890950165, + "learning_rate": 4.875811395223023e-07, + "loss": 0.317, + "step": 15395 + }, + { + "epoch": 2.62, + "grad_norm": 4.887885997760296, + "learning_rate": 4.85453193371696e-07, + "loss": 0.33, + "step": 15400 + }, + { + "epoch": 2.62, + "grad_norm": 5.376128690566044, + "learning_rate": 4.83329663924001e-07, + "loss": 0.3245, + "step": 15405 + }, + { + "epoch": 2.62, + "grad_norm": 4.8938060055908865, + "learning_rate": 4.812105532567191e-07, + "loss": 0.3427, + "step": 15410 + }, + { + "epoch": 2.62, + "grad_norm": 5.11132863259498, + "learning_rate": 4.790958634430365e-07, + "loss": 0.3317, + "step": 15415 + }, + { + "epoch": 2.62, + "grad_norm": 4.724591336463593, + "learning_rate": 4.7698559655181e-07, + "loss": 0.3357, + "step": 15420 + }, + { + "epoch": 2.62, + "grad_norm": 4.951037262704773, + "learning_rate": 4.748797546475703e-07, + "loss": 0.3268, + "step": 15425 + }, + { + "epoch": 2.62, + "grad_norm": 4.8802443891193565, + "learning_rate": 4.727783397905211e-07, + "loss": 0.3331, + "step": 15430 + }, + { + "epoch": 2.62, + "grad_norm": 4.373028538851771, + "learning_rate": 4.706813540365313e-07, + "loss": 0.3267, + "step": 15435 + }, + { + "epoch": 2.62, + "grad_norm": 6.122718060013959, + "learning_rate": 4.6858879943713965e-07, + "loss": 0.3286, + "step": 15440 + }, + { + "epoch": 2.63, + "grad_norm": 5.987546920660024, + "learning_rate": 4.665006780395492e-07, + "loss": 0.3243, + "step": 15445 + }, + { + "epoch": 2.63, + "grad_norm": 5.851157448444356, + "learning_rate": 4.6441699188662424e-07, + "loss": 0.3309, + "step": 15450 + }, + { + "epoch": 2.63, + "grad_norm": 4.222443952757895, + "learning_rate": 4.623377430168913e-07, + "loss": 0.325, + "step": 15455 + }, + { + "epoch": 2.63, + "grad_norm": 3.9519127965546392, + "learning_rate": 4.6026293346453644e-07, + "loss": 0.3299, + "step": 15460 + }, + { + "epoch": 2.63, + "grad_norm": 4.281245218736686, + "learning_rate": 4.581925652594016e-07, + "loss": 0.3293, + "step": 15465 + }, + { + "epoch": 2.63, + "grad_norm": 5.425900579865196, + "learning_rate": 4.561266404269826e-07, + "loss": 0.3161, + "step": 15470 + }, + { + "epoch": 2.63, + "grad_norm": 4.723935686904716, + "learning_rate": 4.5406516098843166e-07, + "loss": 0.326, + "step": 15475 + }, + { + "epoch": 2.63, + "grad_norm": 4.551245390291361, + "learning_rate": 4.5200812896054714e-07, + "loss": 0.3285, + "step": 15480 + }, + { + "epoch": 2.63, + "grad_norm": 4.385884864216109, + "learning_rate": 4.49955546355782e-07, + "loss": 0.3308, + "step": 15485 + }, + { + "epoch": 2.63, + "grad_norm": 5.182454178384056, + "learning_rate": 4.479074151822299e-07, + "loss": 0.337, + "step": 15490 + }, + { + "epoch": 2.63, + "grad_norm": 4.427294600601199, + "learning_rate": 4.458637374436353e-07, + "loss": 0.3282, + "step": 15495 + }, + { + "epoch": 2.64, + "grad_norm": 4.293405692501016, + "learning_rate": 4.4382451513938163e-07, + "loss": 0.3333, + "step": 15500 + }, + { + "epoch": 2.64, + "grad_norm": 4.942125262624205, + "learning_rate": 4.4178975026449634e-07, + "loss": 0.3322, + "step": 15505 + }, + { + "epoch": 2.64, + "grad_norm": 4.20532593695376, + "learning_rate": 4.397594448096448e-07, + "loss": 0.3221, + "step": 15510 + }, + { + "epoch": 2.64, + "grad_norm": 4.226395874757872, + "learning_rate": 4.377336007611277e-07, + "loss": 0.3227, + "step": 15515 + }, + { + "epoch": 2.64, + "grad_norm": 4.300060759903124, + "learning_rate": 4.357122201008851e-07, + "loss": 0.3342, + "step": 15520 + }, + { + "epoch": 2.64, + "grad_norm": 4.533745505806048, + "learning_rate": 4.3369530480648737e-07, + "loss": 0.332, + "step": 15525 + }, + { + "epoch": 2.64, + "grad_norm": 5.335895542748602, + "learning_rate": 4.316828568511372e-07, + "loss": 0.3218, + "step": 15530 + }, + { + "epoch": 2.64, + "grad_norm": 5.288348245563574, + "learning_rate": 4.296748782036658e-07, + "loss": 0.3345, + "step": 15535 + }, + { + "epoch": 2.64, + "grad_norm": 5.385575084440829, + "learning_rate": 4.276713708285346e-07, + "loss": 0.3355, + "step": 15540 + }, + { + "epoch": 2.64, + "grad_norm": 5.16265164571498, + "learning_rate": 4.256723366858267e-07, + "loss": 0.3326, + "step": 15545 + }, + { + "epoch": 2.64, + "grad_norm": 4.65564953851326, + "learning_rate": 4.236777777312534e-07, + "loss": 0.3206, + "step": 15550 + }, + { + "epoch": 2.64, + "grad_norm": 4.706592827325074, + "learning_rate": 4.2168769591614476e-07, + "loss": 0.3363, + "step": 15555 + }, + { + "epoch": 2.65, + "grad_norm": 4.220505773528335, + "learning_rate": 4.197020931874507e-07, + "loss": 0.3274, + "step": 15560 + }, + { + "epoch": 2.65, + "grad_norm": 4.213911131499332, + "learning_rate": 4.1772097148774173e-07, + "loss": 0.3255, + "step": 15565 + }, + { + "epoch": 2.65, + "grad_norm": 4.282918448441257, + "learning_rate": 4.1574433275519963e-07, + "loss": 0.3278, + "step": 15570 + }, + { + "epoch": 2.65, + "grad_norm": 4.404275303777677, + "learning_rate": 4.1377217892362653e-07, + "loss": 0.3296, + "step": 15575 + }, + { + "epoch": 2.65, + "grad_norm": 5.876173248789661, + "learning_rate": 4.118045119224312e-07, + "loss": 0.3306, + "step": 15580 + }, + { + "epoch": 2.65, + "grad_norm": 5.255389054786569, + "learning_rate": 4.0984133367663717e-07, + "loss": 0.3213, + "step": 15585 + }, + { + "epoch": 2.65, + "grad_norm": 5.024576780909882, + "learning_rate": 4.07882646106873e-07, + "loss": 0.3324, + "step": 15590 + }, + { + "epoch": 2.65, + "grad_norm": 4.443562631028897, + "learning_rate": 4.0592845112937764e-07, + "loss": 0.3287, + "step": 15595 + }, + { + "epoch": 2.65, + "grad_norm": 5.128444756291535, + "learning_rate": 4.0397875065599225e-07, + "loss": 0.3266, + "step": 15600 + }, + { + "epoch": 2.65, + "grad_norm": 4.473835014555687, + "learning_rate": 4.0203354659415995e-07, + "loss": 0.3269, + "step": 15605 + }, + { + "epoch": 2.65, + "grad_norm": 5.302270282731905, + "learning_rate": 4.0009284084692734e-07, + "loss": 0.3195, + "step": 15610 + }, + { + "epoch": 2.65, + "grad_norm": 4.244008416918633, + "learning_rate": 3.981566353129385e-07, + "loss": 0.3298, + "step": 15615 + }, + { + "epoch": 2.66, + "grad_norm": 4.511728239678076, + "learning_rate": 3.9622493188643695e-07, + "loss": 0.3235, + "step": 15620 + }, + { + "epoch": 2.66, + "grad_norm": 4.7468068473137235, + "learning_rate": 3.9429773245725836e-07, + "loss": 0.33, + "step": 15625 + }, + { + "epoch": 2.66, + "grad_norm": 4.532017615851619, + "learning_rate": 3.9237503891083604e-07, + "loss": 0.3328, + "step": 15630 + }, + { + "epoch": 2.66, + "grad_norm": 3.9984381654002403, + "learning_rate": 3.904568531281905e-07, + "loss": 0.3259, + "step": 15635 + }, + { + "epoch": 2.66, + "grad_norm": 4.369737981053874, + "learning_rate": 3.8854317698593713e-07, + "loss": 0.3219, + "step": 15640 + }, + { + "epoch": 2.66, + "grad_norm": 4.891949730049542, + "learning_rate": 3.866340123562756e-07, + "loss": 0.3275, + "step": 15645 + }, + { + "epoch": 2.66, + "grad_norm": 4.777212770846676, + "learning_rate": 3.8472936110699354e-07, + "loss": 0.3311, + "step": 15650 + }, + { + "epoch": 2.66, + "grad_norm": 4.511407488239449, + "learning_rate": 3.8282922510146257e-07, + "loss": 0.3275, + "step": 15655 + }, + { + "epoch": 2.66, + "grad_norm": 4.363462161440987, + "learning_rate": 3.8093360619863575e-07, + "loss": 0.3244, + "step": 15660 + }, + { + "epoch": 2.66, + "grad_norm": 4.1267355991414805, + "learning_rate": 3.7904250625305006e-07, + "loss": 0.3266, + "step": 15665 + }, + { + "epoch": 2.66, + "grad_norm": 4.521972951609363, + "learning_rate": 3.771559271148184e-07, + "loss": 0.333, + "step": 15670 + }, + { + "epoch": 2.66, + "grad_norm": 4.264437267832434, + "learning_rate": 3.7527387062963274e-07, + "loss": 0.3327, + "step": 15675 + }, + { + "epoch": 2.67, + "grad_norm": 6.003377017199365, + "learning_rate": 3.7339633863875956e-07, + "loss": 0.3262, + "step": 15680 + }, + { + "epoch": 2.67, + "grad_norm": 6.080670004347522, + "learning_rate": 3.715233329790391e-07, + "loss": 0.3284, + "step": 15685 + }, + { + "epoch": 2.67, + "grad_norm": 4.331208989720204, + "learning_rate": 3.6965485548288217e-07, + "loss": 0.3187, + "step": 15690 + }, + { + "epoch": 2.67, + "grad_norm": 4.463432793538451, + "learning_rate": 3.677909079782721e-07, + "loss": 0.3214, + "step": 15695 + }, + { + "epoch": 2.67, + "grad_norm": 4.765134561944942, + "learning_rate": 3.6593149228875915e-07, + "loss": 0.3294, + "step": 15700 + }, + { + "epoch": 2.67, + "grad_norm": 5.288035996501678, + "learning_rate": 3.640766102334581e-07, + "loss": 0.3286, + "step": 15705 + }, + { + "epoch": 2.67, + "grad_norm": 6.418751421483361, + "learning_rate": 3.622262636270518e-07, + "loss": 0.3216, + "step": 15710 + }, + { + "epoch": 2.67, + "grad_norm": 4.867108747967822, + "learning_rate": 3.603804542797829e-07, + "loss": 0.3272, + "step": 15715 + }, + { + "epoch": 2.67, + "grad_norm": 5.276748456421316, + "learning_rate": 3.5853918399745836e-07, + "loss": 0.3212, + "step": 15720 + }, + { + "epoch": 2.67, + "grad_norm": 4.2089463813160854, + "learning_rate": 3.567024545814413e-07, + "loss": 0.3262, + "step": 15725 + }, + { + "epoch": 2.67, + "grad_norm": 5.083727820870298, + "learning_rate": 3.548702678286542e-07, + "loss": 0.3294, + "step": 15730 + }, + { + "epoch": 2.68, + "grad_norm": 4.290004487380496, + "learning_rate": 3.5304262553157277e-07, + "loss": 0.3369, + "step": 15735 + }, + { + "epoch": 2.68, + "grad_norm": 6.074764534438417, + "learning_rate": 3.5121952947823166e-07, + "loss": 0.3298, + "step": 15740 + }, + { + "epoch": 2.68, + "grad_norm": 4.5570105710259305, + "learning_rate": 3.494009814522137e-07, + "loss": 0.3286, + "step": 15745 + }, + { + "epoch": 2.68, + "grad_norm": 6.788132016382686, + "learning_rate": 3.475869832326523e-07, + "loss": 0.3233, + "step": 15750 + }, + { + "epoch": 2.68, + "grad_norm": 4.219825053242776, + "learning_rate": 3.4577753659423287e-07, + "loss": 0.3308, + "step": 15755 + }, + { + "epoch": 2.68, + "grad_norm": 4.414709813806513, + "learning_rate": 3.4397264330718437e-07, + "loss": 0.3344, + "step": 15760 + }, + { + "epoch": 2.68, + "grad_norm": 4.2904459457604265, + "learning_rate": 3.421723051372844e-07, + "loss": 0.3254, + "step": 15765 + }, + { + "epoch": 2.68, + "grad_norm": 4.503347436179465, + "learning_rate": 3.403765238458495e-07, + "loss": 0.3218, + "step": 15770 + }, + { + "epoch": 2.68, + "grad_norm": 5.221171414137864, + "learning_rate": 3.385853011897433e-07, + "loss": 0.3319, + "step": 15775 + }, + { + "epoch": 2.68, + "grad_norm": 5.93086889960561, + "learning_rate": 3.367986389213662e-07, + "loss": 0.327, + "step": 15780 + }, + { + "epoch": 2.68, + "grad_norm": 4.106430303910091, + "learning_rate": 3.350165387886584e-07, + "loss": 0.3215, + "step": 15785 + }, + { + "epoch": 2.68, + "grad_norm": 5.28094196659452, + "learning_rate": 3.3323900253509736e-07, + "loss": 0.3275, + "step": 15790 + }, + { + "epoch": 2.69, + "grad_norm": 4.613510431621033, + "learning_rate": 3.314660318996921e-07, + "loss": 0.3171, + "step": 15795 + }, + { + "epoch": 2.69, + "grad_norm": 5.309412604149584, + "learning_rate": 3.296976286169906e-07, + "loss": 0.3258, + "step": 15800 + }, + { + "epoch": 2.69, + "grad_norm": 4.364148651082251, + "learning_rate": 3.2793379441706854e-07, + "loss": 0.333, + "step": 15805 + }, + { + "epoch": 2.69, + "grad_norm": 9.300943879556085, + "learning_rate": 3.261745310255321e-07, + "loss": 0.3151, + "step": 15810 + }, + { + "epoch": 2.69, + "grad_norm": 7.040209421392553, + "learning_rate": 3.244198401635157e-07, + "loss": 0.3209, + "step": 15815 + }, + { + "epoch": 2.69, + "grad_norm": 4.794991959613715, + "learning_rate": 3.226697235476817e-07, + "loss": 0.306, + "step": 15820 + }, + { + "epoch": 2.69, + "grad_norm": 4.935248036664671, + "learning_rate": 3.2092418289021487e-07, + "loss": 0.3253, + "step": 15825 + }, + { + "epoch": 2.69, + "grad_norm": 6.056519209770571, + "learning_rate": 3.1918321989882706e-07, + "loss": 0.318, + "step": 15830 + }, + { + "epoch": 2.69, + "grad_norm": 5.269083367129021, + "learning_rate": 3.17446836276748e-07, + "loss": 0.3314, + "step": 15835 + }, + { + "epoch": 2.69, + "grad_norm": 4.326862720235322, + "learning_rate": 3.157150337227277e-07, + "loss": 0.3167, + "step": 15840 + }, + { + "epoch": 2.69, + "grad_norm": 4.069723672961724, + "learning_rate": 3.1398781393103763e-07, + "loss": 0.3294, + "step": 15845 + }, + { + "epoch": 2.69, + "grad_norm": 5.8522878283056725, + "learning_rate": 3.1226517859146157e-07, + "loss": 0.3302, + "step": 15850 + }, + { + "epoch": 2.7, + "grad_norm": 4.231814147547882, + "learning_rate": 3.105471293893009e-07, + "loss": 0.3266, + "step": 15855 + }, + { + "epoch": 2.7, + "grad_norm": 4.909474166584724, + "learning_rate": 3.088336680053688e-07, + "loss": 0.3243, + "step": 15860 + }, + { + "epoch": 2.7, + "grad_norm": 5.235909628911115, + "learning_rate": 3.071247961159918e-07, + "loss": 0.324, + "step": 15865 + }, + { + "epoch": 2.7, + "grad_norm": 4.734972101490973, + "learning_rate": 3.0542051539300455e-07, + "loss": 0.3275, + "step": 15870 + }, + { + "epoch": 2.7, + "grad_norm": 3.9506257702838843, + "learning_rate": 3.037208275037512e-07, + "loss": 0.3251, + "step": 15875 + }, + { + "epoch": 2.7, + "grad_norm": 4.538223913166323, + "learning_rate": 3.020257341110827e-07, + "loss": 0.3264, + "step": 15880 + }, + { + "epoch": 2.7, + "grad_norm": 4.358203367211378, + "learning_rate": 3.0033523687335363e-07, + "loss": 0.3279, + "step": 15885 + }, + { + "epoch": 2.7, + "grad_norm": 4.231501878147074, + "learning_rate": 2.986493374444244e-07, + "loss": 0.3239, + "step": 15890 + }, + { + "epoch": 2.7, + "grad_norm": 4.837316439950837, + "learning_rate": 2.969680374736539e-07, + "loss": 0.3238, + "step": 15895 + }, + { + "epoch": 2.7, + "grad_norm": 4.018421603392979, + "learning_rate": 2.952913386059053e-07, + "loss": 0.318, + "step": 15900 + }, + { + "epoch": 2.7, + "grad_norm": 4.1435525006903475, + "learning_rate": 2.936192424815365e-07, + "loss": 0.3175, + "step": 15905 + }, + { + "epoch": 2.7, + "grad_norm": 4.12970076162308, + "learning_rate": 2.919517507364056e-07, + "loss": 0.3218, + "step": 15910 + }, + { + "epoch": 2.71, + "grad_norm": 4.27748140244397, + "learning_rate": 2.902888650018648e-07, + "loss": 0.3274, + "step": 15915 + }, + { + "epoch": 2.71, + "grad_norm": 4.885317669031329, + "learning_rate": 2.886305869047584e-07, + "loss": 0.3287, + "step": 15920 + }, + { + "epoch": 2.71, + "grad_norm": 4.28260729481659, + "learning_rate": 2.86976918067427e-07, + "loss": 0.3251, + "step": 15925 + }, + { + "epoch": 2.71, + "grad_norm": 5.475073429347808, + "learning_rate": 2.8532786010769753e-07, + "loss": 0.3391, + "step": 15930 + }, + { + "epoch": 2.71, + "grad_norm": 4.821053704563675, + "learning_rate": 2.8368341463888895e-07, + "loss": 0.3192, + "step": 15935 + }, + { + "epoch": 2.71, + "grad_norm": 5.130502760793864, + "learning_rate": 2.8204358326980543e-07, + "loss": 0.3255, + "step": 15940 + }, + { + "epoch": 2.71, + "grad_norm": 4.893758489623744, + "learning_rate": 2.8040836760473977e-07, + "loss": 0.3357, + "step": 15945 + }, + { + "epoch": 2.71, + "grad_norm": 4.2516531067849845, + "learning_rate": 2.7877776924346625e-07, + "loss": 0.3132, + "step": 15950 + }, + { + "epoch": 2.71, + "grad_norm": 7.406973652341834, + "learning_rate": 2.771517897812437e-07, + "loss": 0.3293, + "step": 15955 + }, + { + "epoch": 2.71, + "grad_norm": 5.322483066999841, + "learning_rate": 2.755304308088125e-07, + "loss": 0.3166, + "step": 15960 + }, + { + "epoch": 2.71, + "grad_norm": 5.474306932264857, + "learning_rate": 2.7391369391239043e-07, + "loss": 0.3288, + "step": 15965 + }, + { + "epoch": 2.72, + "grad_norm": 4.20725319692026, + "learning_rate": 2.723015806736756e-07, + "loss": 0.3171, + "step": 15970 + }, + { + "epoch": 2.72, + "grad_norm": 4.535491629522536, + "learning_rate": 2.7069409266984204e-07, + "loss": 0.3264, + "step": 15975 + }, + { + "epoch": 2.72, + "grad_norm": 5.761847179835859, + "learning_rate": 2.690912314735383e-07, + "loss": 0.3269, + "step": 15980 + }, + { + "epoch": 2.72, + "grad_norm": 5.109385173462457, + "learning_rate": 2.6749299865288626e-07, + "loss": 0.3224, + "step": 15985 + }, + { + "epoch": 2.72, + "grad_norm": 4.879185729585172, + "learning_rate": 2.6589939577148115e-07, + "loss": 0.3297, + "step": 15990 + }, + { + "epoch": 2.72, + "grad_norm": 4.056409679988566, + "learning_rate": 2.6431042438838707e-07, + "loss": 0.322, + "step": 15995 + }, + { + "epoch": 2.72, + "grad_norm": 5.058301298589732, + "learning_rate": 2.6272608605813766e-07, + "loss": 0.3372, + "step": 16000 + }, + { + "epoch": 2.72, + "grad_norm": 4.0829892398529095, + "learning_rate": 2.611463823307342e-07, + "loss": 0.3256, + "step": 16005 + }, + { + "epoch": 2.72, + "grad_norm": 4.781469546114506, + "learning_rate": 2.595713147516432e-07, + "loss": 0.3281, + "step": 16010 + }, + { + "epoch": 2.72, + "grad_norm": 5.154008309359028, + "learning_rate": 2.5800088486179545e-07, + "loss": 0.326, + "step": 16015 + }, + { + "epoch": 2.72, + "grad_norm": 4.4988103210311365, + "learning_rate": 2.564350941975852e-07, + "loss": 0.3319, + "step": 16020 + }, + { + "epoch": 2.72, + "grad_norm": 4.230648636823393, + "learning_rate": 2.5487394429086764e-07, + "loss": 0.328, + "step": 16025 + }, + { + "epoch": 2.73, + "grad_norm": 5.220354766605433, + "learning_rate": 2.5331743666895725e-07, + "loss": 0.3251, + "step": 16030 + }, + { + "epoch": 2.73, + "grad_norm": 4.181949627869047, + "learning_rate": 2.517655728546287e-07, + "loss": 0.3202, + "step": 16035 + }, + { + "epoch": 2.73, + "grad_norm": 4.18793309691398, + "learning_rate": 2.5021835436611076e-07, + "loss": 0.3219, + "step": 16040 + }, + { + "epoch": 2.73, + "grad_norm": 3.7932237688490047, + "learning_rate": 2.486757827170905e-07, + "loss": 0.3205, + "step": 16045 + }, + { + "epoch": 2.73, + "grad_norm": 4.637007418320029, + "learning_rate": 2.471378594167062e-07, + "loss": 0.3255, + "step": 16050 + }, + { + "epoch": 2.73, + "grad_norm": 4.274747694027581, + "learning_rate": 2.456045859695505e-07, + "loss": 0.318, + "step": 16055 + }, + { + "epoch": 2.73, + "grad_norm": 3.888514536811012, + "learning_rate": 2.440759638756651e-07, + "loss": 0.3147, + "step": 16060 + }, + { + "epoch": 2.73, + "grad_norm": 4.070119120643892, + "learning_rate": 2.425519946305438e-07, + "loss": 0.3255, + "step": 16065 + }, + { + "epoch": 2.73, + "grad_norm": 4.318133851055312, + "learning_rate": 2.4103267972512554e-07, + "loss": 0.3202, + "step": 16070 + }, + { + "epoch": 2.73, + "grad_norm": 4.812137184792789, + "learning_rate": 2.39518020645797e-07, + "loss": 0.3237, + "step": 16075 + }, + { + "epoch": 2.73, + "grad_norm": 4.288701026818577, + "learning_rate": 2.3800801887439106e-07, + "loss": 0.3229, + "step": 16080 + }, + { + "epoch": 2.73, + "grad_norm": 5.333033708892183, + "learning_rate": 2.365026758881822e-07, + "loss": 0.3267, + "step": 16085 + }, + { + "epoch": 2.74, + "grad_norm": 4.423311250484052, + "learning_rate": 2.350019931598896e-07, + "loss": 0.3196, + "step": 16090 + }, + { + "epoch": 2.74, + "grad_norm": 4.8143452561896725, + "learning_rate": 2.3350597215766878e-07, + "loss": 0.3212, + "step": 16095 + }, + { + "epoch": 2.74, + "grad_norm": 4.9134689439462935, + "learning_rate": 2.3201461434512075e-07, + "loss": 0.3289, + "step": 16100 + }, + { + "epoch": 2.74, + "grad_norm": 4.438047747163149, + "learning_rate": 2.305279211812783e-07, + "loss": 0.3161, + "step": 16105 + }, + { + "epoch": 2.74, + "grad_norm": 4.079736872305593, + "learning_rate": 2.2904589412061528e-07, + "loss": 0.3229, + "step": 16110 + }, + { + "epoch": 2.74, + "grad_norm": 4.72236932999239, + "learning_rate": 2.2756853461303797e-07, + "loss": 0.33, + "step": 16115 + }, + { + "epoch": 2.74, + "grad_norm": 6.102576174544622, + "learning_rate": 2.2609584410388685e-07, + "loss": 0.3234, + "step": 16120 + }, + { + "epoch": 2.74, + "grad_norm": 6.018929984105898, + "learning_rate": 2.2462782403393557e-07, + "loss": 0.3098, + "step": 16125 + }, + { + "epoch": 2.74, + "grad_norm": 4.505430379967904, + "learning_rate": 2.2316447583938694e-07, + "loss": 0.3351, + "step": 16130 + }, + { + "epoch": 2.74, + "grad_norm": 4.084487341521747, + "learning_rate": 2.2170580095187466e-07, + "loss": 0.3242, + "step": 16135 + }, + { + "epoch": 2.74, + "grad_norm": 6.836398225817673, + "learning_rate": 2.2025180079845886e-07, + "loss": 0.3267, + "step": 16140 + }, + { + "epoch": 2.74, + "grad_norm": 4.944041701120263, + "learning_rate": 2.1880247680162836e-07, + "loss": 0.331, + "step": 16145 + }, + { + "epoch": 2.75, + "grad_norm": 4.132936779080271, + "learning_rate": 2.1735783037929391e-07, + "loss": 0.3283, + "step": 16150 + }, + { + "epoch": 2.75, + "grad_norm": 6.9944360578145135, + "learning_rate": 2.1591786294479444e-07, + "loss": 0.3258, + "step": 16155 + }, + { + "epoch": 2.75, + "grad_norm": 4.352649165025649, + "learning_rate": 2.1448257590688747e-07, + "loss": 0.3259, + "step": 16160 + }, + { + "epoch": 2.75, + "grad_norm": 4.013748515607696, + "learning_rate": 2.1305197066975315e-07, + "loss": 0.3219, + "step": 16165 + }, + { + "epoch": 2.75, + "grad_norm": 5.852245030863292, + "learning_rate": 2.1162604863299241e-07, + "loss": 0.3263, + "step": 16170 + }, + { + "epoch": 2.75, + "grad_norm": 4.875041183822977, + "learning_rate": 2.1020481119162106e-07, + "loss": 0.3197, + "step": 16175 + }, + { + "epoch": 2.75, + "grad_norm": 4.902276796757562, + "learning_rate": 2.0878825973607575e-07, + "loss": 0.3186, + "step": 16180 + }, + { + "epoch": 2.75, + "grad_norm": 4.382093906560683, + "learning_rate": 2.0737639565220568e-07, + "loss": 0.3271, + "step": 16185 + }, + { + "epoch": 2.75, + "grad_norm": 7.1592131547023605, + "learning_rate": 2.059692203212771e-07, + "loss": 0.3238, + "step": 16190 + }, + { + "epoch": 2.75, + "grad_norm": 5.699480539733266, + "learning_rate": 2.0456673511996705e-07, + "loss": 0.3365, + "step": 16195 + }, + { + "epoch": 2.75, + "grad_norm": 4.58969484654952, + "learning_rate": 2.0316894142036303e-07, + "loss": 0.3269, + "step": 16200 + }, + { + "epoch": 2.76, + "grad_norm": 5.709577048665052, + "learning_rate": 2.0177584058996667e-07, + "loss": 0.3193, + "step": 16205 + }, + { + "epoch": 2.76, + "grad_norm": 4.366505950527198, + "learning_rate": 2.0038743399168504e-07, + "loss": 0.3215, + "step": 16210 + }, + { + "epoch": 2.76, + "grad_norm": 4.957699551759988, + "learning_rate": 1.9900372298383374e-07, + "loss": 0.323, + "step": 16215 + }, + { + "epoch": 2.76, + "grad_norm": 4.719601013061237, + "learning_rate": 1.976247089201344e-07, + "loss": 0.3297, + "step": 16220 + }, + { + "epoch": 2.76, + "grad_norm": 3.935689059330765, + "learning_rate": 1.9625039314971394e-07, + "loss": 0.3357, + "step": 16225 + }, + { + "epoch": 2.76, + "grad_norm": 6.791720105209688, + "learning_rate": 1.9488077701710238e-07, + "loss": 0.3336, + "step": 16230 + }, + { + "epoch": 2.76, + "grad_norm": 5.143417367694795, + "learning_rate": 1.9351586186223237e-07, + "loss": 0.3238, + "step": 16235 + }, + { + "epoch": 2.76, + "grad_norm": 8.216116273198738, + "learning_rate": 1.9215564902043738e-07, + "loss": 0.3154, + "step": 16240 + }, + { + "epoch": 2.76, + "grad_norm": 5.107020221296381, + "learning_rate": 1.908001398224496e-07, + "loss": 0.3225, + "step": 16245 + }, + { + "epoch": 2.76, + "grad_norm": 4.588168833266675, + "learning_rate": 1.8944933559440105e-07, + "loss": 0.3143, + "step": 16250 + }, + { + "epoch": 2.76, + "grad_norm": 4.79229475914689, + "learning_rate": 1.8810323765781956e-07, + "loss": 0.3296, + "step": 16255 + }, + { + "epoch": 2.76, + "grad_norm": 4.36897448726827, + "learning_rate": 1.8676184732962953e-07, + "loss": 0.3276, + "step": 16260 + }, + { + "epoch": 2.77, + "grad_norm": 4.567793253702501, + "learning_rate": 1.8542516592214788e-07, + "loss": 0.3323, + "step": 16265 + }, + { + "epoch": 2.77, + "grad_norm": 5.50180730085157, + "learning_rate": 1.840931947430874e-07, + "loss": 0.3236, + "step": 16270 + }, + { + "epoch": 2.77, + "grad_norm": 5.784739469539132, + "learning_rate": 1.8276593509555073e-07, + "loss": 0.3254, + "step": 16275 + }, + { + "epoch": 2.77, + "grad_norm": 5.093679564094168, + "learning_rate": 1.8144338827803253e-07, + "loss": 0.3212, + "step": 16280 + }, + { + "epoch": 2.77, + "grad_norm": 4.188116061505996, + "learning_rate": 1.8012555558441558e-07, + "loss": 0.3268, + "step": 16285 + }, + { + "epoch": 2.77, + "grad_norm": 4.188884125901071, + "learning_rate": 1.7881243830397133e-07, + "loss": 0.3254, + "step": 16290 + }, + { + "epoch": 2.77, + "grad_norm": 4.3028610933926466, + "learning_rate": 1.7750403772135716e-07, + "loss": 0.3221, + "step": 16295 + }, + { + "epoch": 2.77, + "grad_norm": 4.748550442739596, + "learning_rate": 1.7620035511661748e-07, + "loss": 0.3197, + "step": 16300 + }, + { + "epoch": 2.77, + "grad_norm": 4.34403152927218, + "learning_rate": 1.749013917651804e-07, + "loss": 0.317, + "step": 16305 + }, + { + "epoch": 2.77, + "grad_norm": 4.615831309228839, + "learning_rate": 1.736071489378549e-07, + "loss": 0.3144, + "step": 16310 + }, + { + "epoch": 2.77, + "grad_norm": 5.699662803614298, + "learning_rate": 1.7231762790083594e-07, + "loss": 0.3213, + "step": 16315 + }, + { + "epoch": 2.77, + "grad_norm": 4.770633996070142, + "learning_rate": 1.7103282991569548e-07, + "loss": 0.3267, + "step": 16320 + }, + { + "epoch": 2.78, + "grad_norm": 4.955789013505318, + "learning_rate": 1.6975275623938637e-07, + "loss": 0.3246, + "step": 16325 + }, + { + "epoch": 2.78, + "grad_norm": 4.521376850056406, + "learning_rate": 1.6847740812423962e-07, + "loss": 0.3397, + "step": 16330 + }, + { + "epoch": 2.78, + "grad_norm": 5.013497701337801, + "learning_rate": 1.6720678681796165e-07, + "loss": 0.3107, + "step": 16335 + }, + { + "epoch": 2.78, + "grad_norm": 5.053988036087268, + "learning_rate": 1.659408935636364e-07, + "loss": 0.3183, + "step": 16340 + }, + { + "epoch": 2.78, + "grad_norm": 5.341405350563486, + "learning_rate": 1.6467972959972102e-07, + "loss": 0.3217, + "step": 16345 + }, + { + "epoch": 2.78, + "grad_norm": 3.9646969352659425, + "learning_rate": 1.6342329616004683e-07, + "loss": 0.3227, + "step": 16350 + }, + { + "epoch": 2.78, + "grad_norm": 4.1896122643170335, + "learning_rate": 1.6217159447381502e-07, + "loss": 0.3202, + "step": 16355 + }, + { + "epoch": 2.78, + "grad_norm": 4.565785467430683, + "learning_rate": 1.609246257656011e-07, + "loss": 0.3202, + "step": 16360 + }, + { + "epoch": 2.78, + "grad_norm": 6.4492287907440105, + "learning_rate": 1.596823912553469e-07, + "loss": 0.3148, + "step": 16365 + }, + { + "epoch": 2.78, + "grad_norm": 4.0934813034293605, + "learning_rate": 1.584448921583648e-07, + "loss": 0.3248, + "step": 16370 + }, + { + "epoch": 2.78, + "grad_norm": 4.659745455359364, + "learning_rate": 1.5721212968533238e-07, + "loss": 0.3165, + "step": 16375 + }, + { + "epoch": 2.78, + "grad_norm": 4.000420361796629, + "learning_rate": 1.5598410504229554e-07, + "loss": 0.3213, + "step": 16380 + }, + { + "epoch": 2.79, + "grad_norm": 4.169494550177258, + "learning_rate": 1.5476081943066268e-07, + "loss": 0.3222, + "step": 16385 + }, + { + "epoch": 2.79, + "grad_norm": 4.52313110624559, + "learning_rate": 1.5354227404720867e-07, + "loss": 0.3212, + "step": 16390 + }, + { + "epoch": 2.79, + "grad_norm": 5.40933196464466, + "learning_rate": 1.523284700840688e-07, + "loss": 0.3264, + "step": 16395 + }, + { + "epoch": 2.79, + "grad_norm": 5.64794423861692, + "learning_rate": 1.511194087287393e-07, + "loss": 0.3221, + "step": 16400 + }, + { + "epoch": 2.79, + "grad_norm": 4.240945621396869, + "learning_rate": 1.4991509116407842e-07, + "loss": 0.3204, + "step": 16405 + }, + { + "epoch": 2.79, + "grad_norm": 5.319882401349022, + "learning_rate": 1.4871551856830259e-07, + "loss": 0.332, + "step": 16410 + }, + { + "epoch": 2.79, + "grad_norm": 4.365765429318108, + "learning_rate": 1.475206921149852e-07, + "loss": 0.3272, + "step": 16415 + }, + { + "epoch": 2.79, + "grad_norm": 4.018989346460705, + "learning_rate": 1.4633061297305796e-07, + "loss": 0.318, + "step": 16420 + }, + { + "epoch": 2.79, + "grad_norm": 4.323980559831332, + "learning_rate": 1.4514528230680726e-07, + "loss": 0.3299, + "step": 16425 + }, + { + "epoch": 2.79, + "grad_norm": 5.19585151471915, + "learning_rate": 1.4396470127587382e-07, + "loss": 0.3181, + "step": 16430 + }, + { + "epoch": 2.79, + "grad_norm": 6.836398734241736, + "learning_rate": 1.4278887103525153e-07, + "loss": 0.3227, + "step": 16435 + }, + { + "epoch": 2.79, + "grad_norm": 4.60566893832175, + "learning_rate": 1.4161779273528797e-07, + "loss": 0.3225, + "step": 16440 + }, + { + "epoch": 2.8, + "grad_norm": 4.06213953867403, + "learning_rate": 1.4045146752167948e-07, + "loss": 0.3299, + "step": 16445 + }, + { + "epoch": 2.8, + "grad_norm": 5.8865681200680235, + "learning_rate": 1.3928989653547498e-07, + "loss": 0.3276, + "step": 16450 + }, + { + "epoch": 2.8, + "grad_norm": 4.753344877185966, + "learning_rate": 1.3813308091306876e-07, + "loss": 0.3342, + "step": 16455 + }, + { + "epoch": 2.8, + "grad_norm": 5.38252543437366, + "learning_rate": 1.3698102178620664e-07, + "loss": 0.3261, + "step": 16460 + }, + { + "epoch": 2.8, + "grad_norm": 5.443970363526169, + "learning_rate": 1.3583372028197704e-07, + "loss": 0.326, + "step": 16465 + }, + { + "epoch": 2.8, + "grad_norm": 4.581110482318744, + "learning_rate": 1.3469117752281767e-07, + "loss": 0.3259, + "step": 16470 + }, + { + "epoch": 2.8, + "grad_norm": 5.142538575844111, + "learning_rate": 1.335533946265083e-07, + "loss": 0.3255, + "step": 16475 + }, + { + "epoch": 2.8, + "grad_norm": 4.4172771144012435, + "learning_rate": 1.3242037270617292e-07, + "loss": 0.3406, + "step": 16480 + }, + { + "epoch": 2.8, + "grad_norm": 6.996640252062232, + "learning_rate": 1.312921128702771e-07, + "loss": 0.3238, + "step": 16485 + }, + { + "epoch": 2.8, + "grad_norm": 4.147013677924641, + "learning_rate": 1.3016861622262788e-07, + "loss": 0.3222, + "step": 16490 + }, + { + "epoch": 2.8, + "grad_norm": 3.969175635638045, + "learning_rate": 1.2904988386237272e-07, + "loss": 0.3211, + "step": 16495 + }, + { + "epoch": 2.81, + "grad_norm": 4.304735817041214, + "learning_rate": 1.2793591688399665e-07, + "loss": 0.3154, + "step": 16500 + }, + { + "epoch": 2.81, + "grad_norm": 4.417624275374276, + "learning_rate": 1.2682671637732512e-07, + "loss": 0.3281, + "step": 16505 + }, + { + "epoch": 2.81, + "grad_norm": 4.3438042106250485, + "learning_rate": 1.2572228342751737e-07, + "loss": 0.324, + "step": 16510 + }, + { + "epoch": 2.81, + "grad_norm": 5.222667005638983, + "learning_rate": 1.2462261911507124e-07, + "loss": 0.3226, + "step": 16515 + }, + { + "epoch": 2.81, + "grad_norm": 4.666061908982401, + "learning_rate": 1.2352772451581784e-07, + "loss": 0.318, + "step": 16520 + }, + { + "epoch": 2.81, + "grad_norm": 4.977480571113075, + "learning_rate": 1.2243760070092093e-07, + "loss": 0.3197, + "step": 16525 + }, + { + "epoch": 2.81, + "grad_norm": 4.020839554052547, + "learning_rate": 1.21352248736879e-07, + "loss": 0.3143, + "step": 16530 + }, + { + "epoch": 2.81, + "grad_norm": 4.9117798883194865, + "learning_rate": 1.2027166968552163e-07, + "loss": 0.3185, + "step": 16535 + }, + { + "epoch": 2.81, + "grad_norm": 5.00269934718407, + "learning_rate": 1.19195864604007e-07, + "loss": 0.3221, + "step": 16540 + }, + { + "epoch": 2.81, + "grad_norm": 6.2316602057613455, + "learning_rate": 1.1812483454482493e-07, + "loss": 0.3251, + "step": 16545 + }, + { + "epoch": 2.81, + "grad_norm": 3.958986638942587, + "learning_rate": 1.1705858055579389e-07, + "loss": 0.316, + "step": 16550 + }, + { + "epoch": 2.81, + "grad_norm": 4.43777934051335, + "learning_rate": 1.1599710368005723e-07, + "loss": 0.3128, + "step": 16555 + }, + { + "epoch": 2.82, + "grad_norm": 4.872643950834902, + "learning_rate": 1.149404049560876e-07, + "loss": 0.3172, + "step": 16560 + }, + { + "epoch": 2.82, + "grad_norm": 5.0242774912669885, + "learning_rate": 1.1388848541768193e-07, + "loss": 0.3302, + "step": 16565 + }, + { + "epoch": 2.82, + "grad_norm": 4.6394760087647064, + "learning_rate": 1.1284134609396091e-07, + "loss": 0.3134, + "step": 16570 + }, + { + "epoch": 2.82, + "grad_norm": 4.20248467236056, + "learning_rate": 1.117989880093695e-07, + "loss": 0.3242, + "step": 16575 + }, + { + "epoch": 2.82, + "grad_norm": 4.993861326186423, + "learning_rate": 1.107614121836742e-07, + "loss": 0.3232, + "step": 16580 + }, + { + "epoch": 2.82, + "grad_norm": 4.140204526928821, + "learning_rate": 1.0972861963196469e-07, + "loss": 0.3131, + "step": 16585 + }, + { + "epoch": 2.82, + "grad_norm": 6.362294713393221, + "learning_rate": 1.0870061136464772e-07, + "loss": 0.3183, + "step": 16590 + }, + { + "epoch": 2.82, + "grad_norm": 6.061443962737918, + "learning_rate": 1.0767738838745379e-07, + "loss": 0.32, + "step": 16595 + }, + { + "epoch": 2.82, + "grad_norm": 5.021965056921146, + "learning_rate": 1.066589517014277e-07, + "loss": 0.3165, + "step": 16600 + }, + { + "epoch": 2.82, + "grad_norm": 4.388055243217996, + "learning_rate": 1.0564530230293468e-07, + "loss": 0.3165, + "step": 16605 + }, + { + "epoch": 2.82, + "grad_norm": 5.35685916039647, + "learning_rate": 1.0463644118365535e-07, + "loss": 0.3169, + "step": 16610 + }, + { + "epoch": 2.82, + "grad_norm": 5.082810603055631, + "learning_rate": 1.0363236933058462e-07, + "loss": 0.3223, + "step": 16615 + }, + { + "epoch": 2.83, + "grad_norm": 4.12496359069897, + "learning_rate": 1.0263308772603397e-07, + "loss": 0.3259, + "step": 16620 + }, + { + "epoch": 2.83, + "grad_norm": 4.348108139298327, + "learning_rate": 1.0163859734762749e-07, + "loss": 0.3187, + "step": 16625 + }, + { + "epoch": 2.83, + "grad_norm": 4.1193485171849975, + "learning_rate": 1.006488991683019e-07, + "loss": 0.3202, + "step": 16630 + }, + { + "epoch": 2.83, + "grad_norm": 4.8436774397202775, + "learning_rate": 9.96639941563049e-08, + "loss": 0.3316, + "step": 16635 + }, + { + "epoch": 2.83, + "grad_norm": 4.415829203215659, + "learning_rate": 9.868388327519685e-08, + "loss": 0.3238, + "step": 16640 + }, + { + "epoch": 2.83, + "grad_norm": 4.197213779335068, + "learning_rate": 9.770856748384516e-08, + "loss": 0.3248, + "step": 16645 + }, + { + "epoch": 2.83, + "grad_norm": 4.196155455714435, + "learning_rate": 9.673804773642936e-08, + "loss": 0.3249, + "step": 16650 + }, + { + "epoch": 2.83, + "grad_norm": 7.855173996053146, + "learning_rate": 9.577232498243383e-08, + "loss": 0.3245, + "step": 16655 + }, + { + "epoch": 2.83, + "grad_norm": 5.120386816838393, + "learning_rate": 9.481140016665169e-08, + "loss": 0.3151, + "step": 16660 + }, + { + "epoch": 2.83, + "grad_norm": 6.56032471761875, + "learning_rate": 9.385527422918095e-08, + "loss": 0.3141, + "step": 16665 + }, + { + "epoch": 2.83, + "grad_norm": 4.838102733097684, + "learning_rate": 9.290394810542669e-08, + "loss": 0.319, + "step": 16670 + }, + { + "epoch": 2.83, + "grad_norm": 4.237242674037893, + "learning_rate": 9.195742272609609e-08, + "loss": 0.3299, + "step": 16675 + }, + { + "epoch": 2.84, + "grad_norm": 5.069975175557291, + "learning_rate": 9.101569901719953e-08, + "loss": 0.3269, + "step": 16680 + }, + { + "epoch": 2.84, + "grad_norm": 4.70395556339215, + "learning_rate": 9.007877790005281e-08, + "loss": 0.3216, + "step": 16685 + }, + { + "epoch": 2.84, + "grad_norm": 4.484240354831281, + "learning_rate": 8.91466602912694e-08, + "loss": 0.3258, + "step": 16690 + }, + { + "epoch": 2.84, + "grad_norm": 4.619820251785469, + "learning_rate": 8.821934710276648e-08, + "loss": 0.3264, + "step": 16695 + }, + { + "epoch": 2.84, + "grad_norm": 4.306695765403448, + "learning_rate": 8.72968392417578e-08, + "loss": 0.3186, + "step": 16700 + }, + { + "epoch": 2.84, + "grad_norm": 4.678091335257884, + "learning_rate": 8.637913761075922e-08, + "loss": 0.3136, + "step": 16705 + }, + { + "epoch": 2.84, + "grad_norm": 4.140977332989994, + "learning_rate": 8.546624310758256e-08, + "loss": 0.319, + "step": 16710 + }, + { + "epoch": 2.84, + "grad_norm": 4.245543097098709, + "learning_rate": 8.455815662533617e-08, + "loss": 0.3191, + "step": 16715 + }, + { + "epoch": 2.84, + "grad_norm": 4.2871480871847005, + "learning_rate": 8.36548790524272e-08, + "loss": 0.3154, + "step": 16720 + }, + { + "epoch": 2.84, + "grad_norm": 5.821588086606466, + "learning_rate": 8.27564112725554e-08, + "loss": 0.3282, + "step": 16725 + }, + { + "epoch": 2.84, + "grad_norm": 4.331167692980463, + "learning_rate": 8.186275416471656e-08, + "loss": 0.3108, + "step": 16730 + }, + { + "epoch": 2.85, + "grad_norm": 4.137864192771517, + "learning_rate": 8.097390860319909e-08, + "loss": 0.3191, + "step": 16735 + }, + { + "epoch": 2.85, + "grad_norm": 6.561135191481652, + "learning_rate": 8.008987545758518e-08, + "loss": 0.3168, + "step": 16740 + }, + { + "epoch": 2.85, + "grad_norm": 5.209127320403764, + "learning_rate": 7.921065559274688e-08, + "loss": 0.3311, + "step": 16745 + }, + { + "epoch": 2.85, + "grad_norm": 4.808937295788496, + "learning_rate": 7.833624986885058e-08, + "loss": 0.3204, + "step": 16750 + }, + { + "epoch": 2.85, + "grad_norm": 4.061855686895446, + "learning_rate": 7.746665914134977e-08, + "loss": 0.3224, + "step": 16755 + }, + { + "epoch": 2.85, + "grad_norm": 6.742392831215071, + "learning_rate": 7.66018842609889e-08, + "loss": 0.3245, + "step": 16760 + }, + { + "epoch": 2.85, + "grad_norm": 4.374524764337193, + "learning_rate": 7.574192607380071e-08, + "loss": 0.3318, + "step": 16765 + }, + { + "epoch": 2.85, + "grad_norm": 4.019406843271415, + "learning_rate": 7.488678542110495e-08, + "loss": 0.3212, + "step": 16770 + }, + { + "epoch": 2.85, + "grad_norm": 4.926694113312759, + "learning_rate": 7.403646313950962e-08, + "loss": 0.3328, + "step": 16775 + }, + { + "epoch": 2.85, + "grad_norm": 4.566359108289269, + "learning_rate": 7.319096006090654e-08, + "loss": 0.3108, + "step": 16780 + }, + { + "epoch": 2.85, + "grad_norm": 4.408310731840014, + "learning_rate": 7.235027701247621e-08, + "loss": 0.3212, + "step": 16785 + }, + { + "epoch": 2.85, + "grad_norm": 4.152290333016591, + "learning_rate": 7.151441481667965e-08, + "loss": 0.3355, + "step": 16790 + }, + { + "epoch": 2.86, + "grad_norm": 4.565863091344188, + "learning_rate": 7.068337429126437e-08, + "loss": 0.3247, + "step": 16795 + }, + { + "epoch": 2.86, + "grad_norm": 4.721947965138385, + "learning_rate": 6.985715624925948e-08, + "loss": 0.3219, + "step": 16800 + }, + { + "epoch": 2.86, + "grad_norm": 4.5162074092895645, + "learning_rate": 6.903576149897617e-08, + "loss": 0.3204, + "step": 16805 + }, + { + "epoch": 2.86, + "grad_norm": 4.236228619641246, + "learning_rate": 6.821919084400774e-08, + "loss": 0.3308, + "step": 16810 + }, + { + "epoch": 2.86, + "grad_norm": 4.141813607931332, + "learning_rate": 6.740744508322683e-08, + "loss": 0.3135, + "step": 16815 + }, + { + "epoch": 2.86, + "grad_norm": 5.665322840238465, + "learning_rate": 6.660052501078596e-08, + "loss": 0.3252, + "step": 16820 + }, + { + "epoch": 2.86, + "grad_norm": 4.767201979259087, + "learning_rate": 6.579843141611697e-08, + "loss": 0.3273, + "step": 16825 + }, + { + "epoch": 2.86, + "grad_norm": 4.165531389035974, + "learning_rate": 6.50011650839305e-08, + "loss": 0.3223, + "step": 16830 + }, + { + "epoch": 2.86, + "grad_norm": 4.192803078576282, + "learning_rate": 6.420872679421208e-08, + "loss": 0.3151, + "step": 16835 + }, + { + "epoch": 2.86, + "grad_norm": 5.385516489304573, + "learning_rate": 6.342111732222655e-08, + "loss": 0.3328, + "step": 16840 + }, + { + "epoch": 2.86, + "grad_norm": 5.425070913092375, + "learning_rate": 6.263833743851367e-08, + "loss": 0.3204, + "step": 16845 + }, + { + "epoch": 2.86, + "grad_norm": 4.471432430744481, + "learning_rate": 6.186038790888749e-08, + "loss": 0.3248, + "step": 16850 + }, + { + "epoch": 2.87, + "grad_norm": 4.3856903583224875, + "learning_rate": 6.108726949443756e-08, + "loss": 0.3201, + "step": 16855 + }, + { + "epoch": 2.87, + "grad_norm": 4.589501882050007, + "learning_rate": 6.031898295152605e-08, + "loss": 0.3224, + "step": 16860 + }, + { + "epoch": 2.87, + "grad_norm": 5.674711932036349, + "learning_rate": 5.955552903178896e-08, + "loss": 0.3267, + "step": 16865 + }, + { + "epoch": 2.87, + "grad_norm": 4.10918736978263, + "learning_rate": 5.8796908482132706e-08, + "loss": 0.3152, + "step": 16870 + }, + { + "epoch": 2.87, + "grad_norm": 4.9623222898979105, + "learning_rate": 5.80431220447375e-08, + "loss": 0.3255, + "step": 16875 + }, + { + "epoch": 2.87, + "grad_norm": 4.83341224963951, + "learning_rate": 5.7294170457052326e-08, + "loss": 0.3129, + "step": 16880 + }, + { + "epoch": 2.87, + "grad_norm": 5.207285877907312, + "learning_rate": 5.655005445179662e-08, + "loss": 0.3302, + "step": 16885 + }, + { + "epoch": 2.87, + "grad_norm": 4.267072915271972, + "learning_rate": 5.581077475695973e-08, + "loss": 0.3282, + "step": 16890 + }, + { + "epoch": 2.87, + "grad_norm": 4.213948223184893, + "learning_rate": 5.50763320957981e-08, + "loss": 0.328, + "step": 16895 + }, + { + "epoch": 2.87, + "grad_norm": 4.630031411426699, + "learning_rate": 5.4346727186837534e-08, + "loss": 0.3341, + "step": 16900 + }, + { + "epoch": 2.87, + "grad_norm": 6.480499974246501, + "learning_rate": 5.362196074386983e-08, + "loss": 0.3305, + "step": 16905 + }, + { + "epoch": 2.87, + "grad_norm": 5.359901605698903, + "learning_rate": 5.290203347595335e-08, + "loss": 0.3232, + "step": 16910 + }, + { + "epoch": 2.88, + "grad_norm": 4.488707476211648, + "learning_rate": 5.218694608741304e-08, + "loss": 0.3237, + "step": 16915 + }, + { + "epoch": 2.88, + "grad_norm": 4.183021393567609, + "learning_rate": 5.1476699277837605e-08, + "loss": 0.3186, + "step": 16920 + }, + { + "epoch": 2.88, + "grad_norm": 5.804647675972266, + "learning_rate": 5.077129374208012e-08, + "loss": 0.3257, + "step": 16925 + }, + { + "epoch": 2.88, + "grad_norm": 4.494163540567883, + "learning_rate": 5.007073017025965e-08, + "loss": 0.3314, + "step": 16930 + }, + { + "epoch": 2.88, + "grad_norm": 5.1840909862780515, + "learning_rate": 4.9375009247754626e-08, + "loss": 0.3279, + "step": 16935 + }, + { + "epoch": 2.88, + "grad_norm": 4.572678380765821, + "learning_rate": 4.8684131655208353e-08, + "loss": 0.3116, + "step": 16940 + }, + { + "epoch": 2.88, + "grad_norm": 4.196403827581241, + "learning_rate": 4.799809806852518e-08, + "loss": 0.3184, + "step": 16945 + }, + { + "epoch": 2.88, + "grad_norm": 4.357098767447156, + "learning_rate": 4.7316909158869884e-08, + "loss": 0.3297, + "step": 16950 + }, + { + "epoch": 2.88, + "grad_norm": 5.343916649756927, + "learning_rate": 4.6640565592668276e-08, + "loss": 0.311, + "step": 16955 + }, + { + "epoch": 2.88, + "grad_norm": 4.291618378770368, + "learning_rate": 4.5969068031604945e-08, + "loss": 0.329, + "step": 16960 + }, + { + "epoch": 2.88, + "grad_norm": 4.25102189320903, + "learning_rate": 4.530241713262495e-08, + "loss": 0.3209, + "step": 16965 + }, + { + "epoch": 2.89, + "grad_norm": 4.134754275534955, + "learning_rate": 4.4640613547929925e-08, + "loss": 0.3101, + "step": 16970 + }, + { + "epoch": 2.89, + "grad_norm": 4.545599286611421, + "learning_rate": 4.398365792498083e-08, + "loss": 0.3242, + "step": 16975 + }, + { + "epoch": 2.89, + "grad_norm": 5.196296248690342, + "learning_rate": 4.3331550906494656e-08, + "loss": 0.3239, + "step": 16980 + }, + { + "epoch": 2.89, + "grad_norm": 4.364358485774829, + "learning_rate": 4.268429313044553e-08, + "loss": 0.3226, + "step": 16985 + }, + { + "epoch": 2.89, + "grad_norm": 5.446202143056948, + "learning_rate": 4.204188523006303e-08, + "loss": 0.3247, + "step": 16990 + }, + { + "epoch": 2.89, + "grad_norm": 4.79378156017565, + "learning_rate": 4.140432783383219e-08, + "loss": 0.3221, + "step": 16995 + }, + { + "epoch": 2.89, + "grad_norm": 5.010838753010564, + "learning_rate": 4.077162156549297e-08, + "loss": 0.3324, + "step": 17000 + }, + { + "epoch": 2.89, + "grad_norm": 3.9813207535154596, + "learning_rate": 4.0143767044038554e-08, + "loss": 0.3231, + "step": 17005 + }, + { + "epoch": 2.89, + "grad_norm": 5.901540079957885, + "learning_rate": 3.9520764883715924e-08, + "loss": 0.3275, + "step": 17010 + }, + { + "epoch": 2.89, + "grad_norm": 4.208544291007741, + "learning_rate": 3.8902615694025313e-08, + "loss": 0.3186, + "step": 17015 + }, + { + "epoch": 2.89, + "grad_norm": 4.042927808490137, + "learning_rate": 3.828932007971797e-08, + "loss": 0.3151, + "step": 17020 + }, + { + "epoch": 2.89, + "grad_norm": 4.514916487809097, + "learning_rate": 3.768087864079839e-08, + "loss": 0.3234, + "step": 17025 + }, + { + "epoch": 2.9, + "grad_norm": 5.432387190050278, + "learning_rate": 3.707729197252097e-08, + "loss": 0.3174, + "step": 17030 + }, + { + "epoch": 2.9, + "grad_norm": 6.9894227760113425, + "learning_rate": 3.6478560665390574e-08, + "loss": 0.33, + "step": 17035 + }, + { + "epoch": 2.9, + "grad_norm": 4.295747928089492, + "learning_rate": 3.588468530516198e-08, + "loss": 0.3266, + "step": 17040 + }, + { + "epoch": 2.9, + "grad_norm": 4.407466468401592, + "learning_rate": 3.529566647284044e-08, + "loss": 0.3257, + "step": 17045 + }, + { + "epoch": 2.9, + "grad_norm": 4.747434671821218, + "learning_rate": 3.471150474467777e-08, + "loss": 0.3323, + "step": 17050 + }, + { + "epoch": 2.9, + "grad_norm": 6.148672477395743, + "learning_rate": 3.413220069217627e-08, + "loss": 0.3232, + "step": 17055 + }, + { + "epoch": 2.9, + "grad_norm": 4.212980371901559, + "learning_rate": 3.355775488208368e-08, + "loss": 0.3205, + "step": 17060 + }, + { + "epoch": 2.9, + "grad_norm": 4.595464952249602, + "learning_rate": 3.298816787639714e-08, + "loss": 0.3308, + "step": 17065 + }, + { + "epoch": 2.9, + "grad_norm": 4.261453414240882, + "learning_rate": 3.242344023235755e-08, + "loss": 0.3235, + "step": 17070 + }, + { + "epoch": 2.9, + "grad_norm": 4.140376013153457, + "learning_rate": 3.186357250245409e-08, + "loss": 0.3192, + "step": 17075 + }, + { + "epoch": 2.9, + "grad_norm": 4.201478258105787, + "learning_rate": 3.1308565234420275e-08, + "loss": 0.3211, + "step": 17080 + }, + { + "epoch": 2.9, + "grad_norm": 5.146822624316797, + "learning_rate": 3.0758418971233995e-08, + "loss": 0.3154, + "step": 17085 + }, + { + "epoch": 2.91, + "grad_norm": 4.174234028597232, + "learning_rate": 3.0213134251119716e-08, + "loss": 0.3183, + "step": 17090 + }, + { + "epoch": 2.91, + "grad_norm": 4.298240807790573, + "learning_rate": 2.967271160754237e-08, + "loss": 0.3268, + "step": 17095 + }, + { + "epoch": 2.91, + "grad_norm": 4.227795333228735, + "learning_rate": 2.9137151569213486e-08, + "loss": 0.3214, + "step": 17100 + }, + { + "epoch": 2.91, + "grad_norm": 4.210920820302368, + "learning_rate": 2.8606454660085047e-08, + "loss": 0.3201, + "step": 17105 + }, + { + "epoch": 2.91, + "grad_norm": 4.688694831097193, + "learning_rate": 2.8080621399352857e-08, + "loss": 0.3274, + "step": 17110 + }, + { + "epoch": 2.91, + "grad_norm": 4.016657299349907, + "learning_rate": 2.7559652301452632e-08, + "loss": 0.3165, + "step": 17115 + }, + { + "epoch": 2.91, + "grad_norm": 4.427423155416518, + "learning_rate": 2.704354787606389e-08, + "loss": 0.3265, + "step": 17120 + }, + { + "epoch": 2.91, + "grad_norm": 4.048773944525423, + "learning_rate": 2.653230862810441e-08, + "loss": 0.316, + "step": 17125 + }, + { + "epoch": 2.91, + "grad_norm": 4.363179533603372, + "learning_rate": 2.602593505773354e-08, + "loss": 0.3294, + "step": 17130 + }, + { + "epoch": 2.91, + "grad_norm": 4.343244247482888, + "learning_rate": 2.5524427660351125e-08, + "loss": 0.3286, + "step": 17135 + }, + { + "epoch": 2.91, + "grad_norm": 5.1011993397027995, + "learning_rate": 2.5027786926594132e-08, + "loss": 0.3294, + "step": 17140 + }, + { + "epoch": 2.91, + "grad_norm": 5.095024825445343, + "learning_rate": 2.453601334234057e-08, + "loss": 0.3184, + "step": 17145 + }, + { + "epoch": 2.92, + "grad_norm": 5.533694904679112, + "learning_rate": 2.4049107388705028e-08, + "loss": 0.3165, + "step": 17150 + }, + { + "epoch": 2.92, + "grad_norm": 4.338434029875889, + "learning_rate": 2.356706954204091e-08, + "loss": 0.3194, + "step": 17155 + }, + { + "epoch": 2.92, + "grad_norm": 4.59201757614292, + "learning_rate": 2.3089900273938758e-08, + "loss": 0.3288, + "step": 17160 + }, + { + "epoch": 2.92, + "grad_norm": 4.1761127099824575, + "learning_rate": 2.2617600051226818e-08, + "loss": 0.3285, + "step": 17165 + }, + { + "epoch": 2.92, + "grad_norm": 4.518604552105859, + "learning_rate": 2.2150169335968807e-08, + "loss": 0.322, + "step": 17170 + }, + { + "epoch": 2.92, + "grad_norm": 4.641904310325613, + "learning_rate": 2.168760858546448e-08, + "loss": 0.3237, + "step": 17175 + }, + { + "epoch": 2.92, + "grad_norm": 4.752248201594694, + "learning_rate": 2.1229918252249627e-08, + "loss": 0.3242, + "step": 17180 + }, + { + "epoch": 2.92, + "grad_norm": 4.765539723924974, + "learning_rate": 2.0777098784095507e-08, + "loss": 0.3281, + "step": 17185 + }, + { + "epoch": 2.92, + "grad_norm": 4.333491188039683, + "learning_rate": 2.0329150624006645e-08, + "loss": 0.3343, + "step": 17190 + }, + { + "epoch": 2.92, + "grad_norm": 4.9243081578917645, + "learning_rate": 1.9886074210223592e-08, + "loss": 0.3285, + "step": 17195 + }, + { + "epoch": 2.92, + "grad_norm": 4.142304534229642, + "learning_rate": 1.9447869976220167e-08, + "loss": 0.3264, + "step": 17200 + }, + { + "epoch": 2.93, + "grad_norm": 4.05735236631641, + "learning_rate": 1.901453835070233e-08, + "loss": 0.3115, + "step": 17205 + }, + { + "epoch": 2.93, + "grad_norm": 3.95977328598609, + "learning_rate": 1.858607975761095e-08, + "loss": 0.3274, + "step": 17210 + }, + { + "epoch": 2.93, + "grad_norm": 4.0736743171757075, + "learning_rate": 1.816249461611852e-08, + "loss": 0.3267, + "step": 17215 + }, + { + "epoch": 2.93, + "grad_norm": 4.349859105884953, + "learning_rate": 1.774378334062965e-08, + "loss": 0.3272, + "step": 17220 + }, + { + "epoch": 2.93, + "grad_norm": 4.296848237552809, + "learning_rate": 1.732994634078111e-08, + "loss": 0.3187, + "step": 17225 + }, + { + "epoch": 2.93, + "grad_norm": 4.696438474912808, + "learning_rate": 1.692098402144071e-08, + "loss": 0.331, + "step": 17230 + }, + { + "epoch": 2.93, + "grad_norm": 4.615718612464431, + "learning_rate": 1.6516896782706736e-08, + "loss": 0.3345, + "step": 17235 + }, + { + "epoch": 2.93, + "grad_norm": 4.568930088531357, + "learning_rate": 1.6117685019909623e-08, + "loss": 0.322, + "step": 17240 + }, + { + "epoch": 2.93, + "grad_norm": 5.404550793819637, + "learning_rate": 1.5723349123608067e-08, + "loss": 0.3163, + "step": 17245 + }, + { + "epoch": 2.93, + "grad_norm": 5.629118831077295, + "learning_rate": 1.5333889479592356e-08, + "loss": 0.3237, + "step": 17250 + }, + { + "epoch": 2.93, + "grad_norm": 4.603765706275918, + "learning_rate": 1.4949306468880486e-08, + "loss": 0.3176, + "step": 17255 + }, + { + "epoch": 2.93, + "grad_norm": 4.365997654720079, + "learning_rate": 1.456960046772149e-08, + "loss": 0.3255, + "step": 17260 + }, + { + "epoch": 2.94, + "grad_norm": 4.130421217693394, + "learning_rate": 1.4194771847590994e-08, + "loss": 0.3187, + "step": 17265 + }, + { + "epoch": 2.94, + "grad_norm": 4.305110035082924, + "learning_rate": 1.3824820975194553e-08, + "loss": 0.3094, + "step": 17270 + }, + { + "epoch": 2.94, + "grad_norm": 4.422436622511799, + "learning_rate": 1.3459748212464318e-08, + "loss": 0.3225, + "step": 17275 + }, + { + "epoch": 2.94, + "grad_norm": 4.431422170606385, + "learning_rate": 1.3099553916561813e-08, + "loss": 0.3287, + "step": 17280 + }, + { + "epoch": 2.94, + "grad_norm": 4.241567806730011, + "learning_rate": 1.2744238439874046e-08, + "loss": 0.3261, + "step": 17285 + }, + { + "epoch": 2.94, + "grad_norm": 4.461893699555121, + "learning_rate": 1.239380213001684e-08, + "loss": 0.3182, + "step": 17290 + }, + { + "epoch": 2.94, + "grad_norm": 5.755366473966939, + "learning_rate": 1.2048245329829844e-08, + "loss": 0.3261, + "step": 17295 + }, + { + "epoch": 2.94, + "grad_norm": 4.086374403285044, + "learning_rate": 1.1707568377382072e-08, + "loss": 0.3237, + "step": 17300 + }, + { + "epoch": 2.94, + "grad_norm": 4.37192574562722, + "learning_rate": 1.137177160596581e-08, + "loss": 0.3209, + "step": 17305 + }, + { + "epoch": 2.94, + "grad_norm": 4.113981142802513, + "learning_rate": 1.1040855344101043e-08, + "loss": 0.3163, + "step": 17310 + }, + { + "epoch": 2.94, + "grad_norm": 4.4901520195537765, + "learning_rate": 1.0714819915531582e-08, + "loss": 0.3317, + "step": 17315 + }, + { + "epoch": 2.94, + "grad_norm": 4.06515915220569, + "learning_rate": 1.0393665639226724e-08, + "loss": 0.3286, + "step": 17320 + }, + { + "epoch": 2.95, + "grad_norm": 4.305528353338569, + "learning_rate": 1.007739282938014e-08, + "loss": 0.3205, + "step": 17325 + }, + { + "epoch": 2.95, + "grad_norm": 5.283052479749407, + "learning_rate": 9.766001795410984e-09, + "loss": 0.3217, + "step": 17330 + }, + { + "epoch": 2.95, + "grad_norm": 4.007913866226925, + "learning_rate": 9.459492841960572e-09, + "loss": 0.319, + "step": 17335 + }, + { + "epoch": 2.95, + "grad_norm": 4.892225816928856, + "learning_rate": 9.157866268895144e-09, + "loss": 0.3195, + "step": 17340 + }, + { + "epoch": 2.95, + "grad_norm": 4.2700543147223975, + "learning_rate": 8.861122371303654e-09, + "loss": 0.3251, + "step": 17345 + }, + { + "epoch": 2.95, + "grad_norm": 4.746315215212394, + "learning_rate": 8.569261439499432e-09, + "loss": 0.3291, + "step": 17350 + }, + { + "epoch": 2.95, + "grad_norm": 4.227497950738878, + "learning_rate": 8.282283759017962e-09, + "loss": 0.3251, + "step": 17355 + }, + { + "epoch": 2.95, + "grad_norm": 4.526948346687582, + "learning_rate": 8.000189610616883e-09, + "loss": 0.3167, + "step": 17360 + }, + { + "epoch": 2.95, + "grad_norm": 4.397402173863833, + "learning_rate": 7.722979270275988e-09, + "loss": 0.3139, + "step": 17365 + }, + { + "epoch": 2.95, + "grad_norm": 4.019686407837929, + "learning_rate": 7.450653009198338e-09, + "loss": 0.3203, + "step": 17370 + }, + { + "epoch": 2.95, + "grad_norm": 4.113894086863957, + "learning_rate": 7.18321109380804e-09, + "loss": 0.3153, + "step": 17375 + }, + { + "epoch": 2.95, + "grad_norm": 3.9802828861675468, + "learning_rate": 6.920653785750797e-09, + "loss": 0.3123, + "step": 17380 + }, + { + "epoch": 2.96, + "grad_norm": 7.397341279147858, + "learning_rate": 6.662981341892805e-09, + "loss": 0.3222, + "step": 17385 + }, + { + "epoch": 2.96, + "grad_norm": 4.2876178038487325, + "learning_rate": 6.410194014322413e-09, + "loss": 0.3254, + "step": 17390 + }, + { + "epoch": 2.96, + "grad_norm": 6.362407584057065, + "learning_rate": 6.162292050348462e-09, + "loss": 0.3215, + "step": 17395 + }, + { + "epoch": 2.96, + "grad_norm": 5.260197573800328, + "learning_rate": 5.919275692500281e-09, + "loss": 0.3342, + "step": 17400 + }, + { + "epoch": 2.96, + "grad_norm": 5.652129288543425, + "learning_rate": 5.681145178526581e-09, + "loss": 0.3266, + "step": 17405 + }, + { + "epoch": 2.96, + "grad_norm": 4.0329427262401225, + "learning_rate": 5.4479007413976715e-09, + "loss": 0.3261, + "step": 17410 + }, + { + "epoch": 2.96, + "grad_norm": 4.212095217891655, + "learning_rate": 5.21954260930213e-09, + "loss": 0.3189, + "step": 17415 + }, + { + "epoch": 2.96, + "grad_norm": 4.388186784150905, + "learning_rate": 4.996071005649583e-09, + "loss": 0.3258, + "step": 17420 + }, + { + "epoch": 2.96, + "grad_norm": 4.403577402430027, + "learning_rate": 4.777486149067923e-09, + "loss": 0.3229, + "step": 17425 + }, + { + "epoch": 2.96, + "grad_norm": 5.372666014770122, + "learning_rate": 4.563788253404422e-09, + "loss": 0.3168, + "step": 17430 + }, + { + "epoch": 2.96, + "grad_norm": 5.920216648474848, + "learning_rate": 4.3549775277262895e-09, + "loss": 0.3219, + "step": 17435 + }, + { + "epoch": 2.96, + "grad_norm": 4.75224480939319, + "learning_rate": 4.151054176317337e-09, + "loss": 0.3224, + "step": 17440 + }, + { + "epoch": 2.97, + "grad_norm": 5.17217886620779, + "learning_rate": 3.9520183986829776e-09, + "loss": 0.3291, + "step": 17445 + }, + { + "epoch": 2.97, + "grad_norm": 4.521520496183765, + "learning_rate": 3.757870389544116e-09, + "loss": 0.3193, + "step": 17450 + }, + { + "epoch": 2.97, + "grad_norm": 3.9911833599850017, + "learning_rate": 3.5686103388410385e-09, + "loss": 0.3261, + "step": 17455 + }, + { + "epoch": 2.97, + "grad_norm": 5.8047227129063055, + "learning_rate": 3.384238431732301e-09, + "loss": 0.3251, + "step": 17460 + }, + { + "epoch": 2.97, + "grad_norm": 4.3984464082370645, + "learning_rate": 3.2047548485941714e-09, + "loss": 0.3188, + "step": 17465 + }, + { + "epoch": 2.97, + "grad_norm": 4.579762521361893, + "learning_rate": 3.0301597650195247e-09, + "loss": 0.3259, + "step": 17470 + }, + { + "epoch": 2.97, + "grad_norm": 4.880006797112499, + "learning_rate": 2.8604533518200585e-09, + "loss": 0.3245, + "step": 17475 + }, + { + "epoch": 2.97, + "grad_norm": 4.29310428512058, + "learning_rate": 2.6956357750235195e-09, + "loss": 0.3288, + "step": 17480 + }, + { + "epoch": 2.97, + "grad_norm": 5.429366982118829, + "learning_rate": 2.53570719587648e-09, + "loss": 0.3261, + "step": 17485 + }, + { + "epoch": 2.97, + "grad_norm": 5.47672322500388, + "learning_rate": 2.3806677708398952e-09, + "loss": 0.326, + "step": 17490 + }, + { + "epoch": 2.97, + "grad_norm": 4.315788135319219, + "learning_rate": 2.230517651594655e-09, + "loss": 0.3145, + "step": 17495 + }, + { + "epoch": 2.98, + "grad_norm": 4.145232204854416, + "learning_rate": 2.0852569850354778e-09, + "loss": 0.3167, + "step": 17500 + }, + { + "epoch": 2.98, + "grad_norm": 4.61449092787545, + "learning_rate": 1.9448859132747965e-09, + "loss": 0.3093, + "step": 17505 + }, + { + "epoch": 2.98, + "grad_norm": 6.349955321675779, + "learning_rate": 1.809404573642204e-09, + "loss": 0.3289, + "step": 17510 + }, + { + "epoch": 2.98, + "grad_norm": 5.081659098663104, + "learning_rate": 1.6788130986816754e-09, + "loss": 0.3171, + "step": 17515 + }, + { + "epoch": 2.98, + "grad_norm": 5.59987534077367, + "learning_rate": 1.553111616155456e-09, + "loss": 0.3243, + "step": 17520 + }, + { + "epoch": 2.98, + "grad_norm": 4.241357555063181, + "learning_rate": 1.432300249040175e-09, + "loss": 0.3163, + "step": 17525 + }, + { + "epoch": 2.98, + "grad_norm": 4.650346457651398, + "learning_rate": 1.316379115529065e-09, + "loss": 0.3216, + "step": 17530 + }, + { + "epoch": 2.98, + "grad_norm": 4.413006568192071, + "learning_rate": 1.2053483290308533e-09, + "loss": 0.3165, + "step": 17535 + }, + { + "epoch": 2.98, + "grad_norm": 5.247458222002462, + "learning_rate": 1.099207998169205e-09, + "loss": 0.3255, + "step": 17540 + }, + { + "epoch": 2.98, + "grad_norm": 4.332420786778957, + "learning_rate": 9.979582267855004e-10, + "loss": 0.3211, + "step": 17545 + }, + { + "epoch": 2.98, + "grad_norm": 4.774136612916247, + "learning_rate": 9.015991139338376e-10, + "loss": 0.3266, + "step": 17550 + }, + { + "epoch": 2.98, + "grad_norm": 4.276016406258255, + "learning_rate": 8.101307538854741e-10, + "loss": 0.3287, + "step": 17555 + }, + { + "epoch": 2.99, + "grad_norm": 3.9933204057622826, + "learning_rate": 7.235532361266062e-10, + "loss": 0.3267, + "step": 17560 + }, + { + "epoch": 2.99, + "grad_norm": 4.015434379642638, + "learning_rate": 6.418666453578137e-10, + "loss": 0.3279, + "step": 17565 + }, + { + "epoch": 2.99, + "grad_norm": 4.3100411946404105, + "learning_rate": 5.650710614957255e-10, + "loss": 0.3261, + "step": 17570 + }, + { + "epoch": 2.99, + "grad_norm": 5.620246737157527, + "learning_rate": 4.931665596713542e-10, + "loss": 0.3089, + "step": 17575 + }, + { + "epoch": 2.99, + "grad_norm": 4.131919436844451, + "learning_rate": 4.2615321023065094e-10, + "loss": 0.329, + "step": 17580 + }, + { + "epoch": 2.99, + "grad_norm": 5.213045629785342, + "learning_rate": 3.6403107873450583e-10, + "loss": 0.3256, + "step": 17585 + }, + { + "epoch": 2.99, + "grad_norm": 4.629428548039462, + "learning_rate": 3.068002259593028e-10, + "loss": 0.3195, + "step": 17590 + }, + { + "epoch": 2.99, + "grad_norm": 4.332419507807568, + "learning_rate": 2.5446070789525437e-10, + "loss": 0.3204, + "step": 17595 + }, + { + "epoch": 2.99, + "grad_norm": 5.240834030496161, + "learning_rate": 2.0701257574751165e-10, + "loss": 0.3203, + "step": 17600 + }, + { + "epoch": 2.99, + "grad_norm": 4.572494727034468, + "learning_rate": 1.6445587593560964e-10, + "loss": 0.327, + "step": 17605 + }, + { + "epoch": 2.99, + "grad_norm": 4.2630889633032805, + "learning_rate": 1.267906500940219e-10, + "loss": 0.326, + "step": 17610 + }, + { + "epoch": 2.99, + "grad_norm": 4.181006413141019, + "learning_rate": 9.40169350716058e-11, + "loss": 0.3255, + "step": 17615 + }, + { + "epoch": 3.0, + "grad_norm": 4.117134318017797, + "learning_rate": 6.613476293160226e-11, + "loss": 0.3272, + "step": 17620 + }, + { + "epoch": 3.0, + "grad_norm": 4.00361578124429, + "learning_rate": 4.3144160952746096e-11, + "loss": 0.334, + "step": 17625 + }, + { + "epoch": 3.0, + "grad_norm": 4.3128277913830155, + "learning_rate": 2.5045151626490462e-11, + "loss": 0.3257, + "step": 17630 + }, + { + "epoch": 3.0, + "grad_norm": 4.347718473004982, + "learning_rate": 1.1837752659782375e-11, + "loss": 0.3184, + "step": 17635 + }, + { + "epoch": 3.0, + "grad_norm": 4.857624199795862, + "learning_rate": 3.5219769745076237e-12, + "loss": 0.3295, + "step": 17640 + }, + { + "epoch": 3.0, + "grad_norm": 4.70428450020991, + "learning_rate": 9.783270471519502e-14, + "loss": 0.3309, + "step": 17645 + }, + { + "epoch": 3.0, + "eval_loss": 0.273702472448349, + "eval_runtime": 75.049, + "eval_samples_per_second": 4.824, + "eval_steps_per_second": 0.613, + "step": 17646 + }, + { + "epoch": 3.0, + "step": 17646, + "total_flos": 607478932832256.0, + "train_loss": 0.7695021375204772, + "train_runtime": 155405.9935, + "train_samples_per_second": 1.817, + "train_steps_per_second": 0.114 + } + ], + "logging_steps": 5, + "max_steps": 17646, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 607478932832256.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}